kindle manager
This commit is contained in:
276
tbook.py
Normal file
276
tbook.py
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
|
||||||
|
#########################################################
|
||||||
|
## @file : tbook.py
|
||||||
|
## @desc    : douban book spider
|
||||||
|
## @create : 2020/6/13
|
||||||
|
## @author : Chengan
|
||||||
|
## @email : douboer@gmail.com
|
||||||
|
#########################################################
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Logging setup: append every DEBUG-and-above record to a file named
# 'log' in the current working directory.
# NOTE(review): this configures the *root* logger, so records from any
# imported library are captured too — confirm that is intended; the
# usual pattern is logging.getLogger(__name__).
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
spidetp = 1 # 0 - douban 1- amazon

# Default request headers (douban); the amazon branch below overwrites
# Host/Referer in place.
mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'http://www.douban.com',
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}

# Query-string parameters and search endpoint for the selected site.
# The 'bkname_xxx'/'bookname_xxx' values are placeholders, replaced per
# query in doubanSpide.parse_books().
mparams = {}
murl = ""
if spidetp==0:
    # Bug fix: the original line ended with a stray comma, which made the
    # value the 1-tuple ('www.douban.com',) instead of a string.
    # NOTE(review): 'Host' looks like a header, not a query parameter —
    # confirm douban actually expects it in the query string.
    mparams['Host']='www.douban.com'
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat']='1001'
    mparams['k']='bookname_xxx'
    murl = "https://search.douban.com/book/subject_search"
else:
    mheaders['Host']='www.amazon.cn'
    mheaders['Referer']='http://www.amazon.cn'   # bug fix: was 'http:/www.amazon.cn'

    #https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
    mparams['Host']='www.amazon.cn'
    mparams['k']='bkname_xxx'
    mparams['i']='stripbooks'
    # Bug fixes (cf. the URL template above): the key was written
    # '__mk_zh_CN=' (stray '='), and 'reg' should be 'ref'.
    mparams['__mk_zh_CN']='亚马逊网站'
    mparams['ref']='nb_sb_noss'

    murl = 'https://www.amazon.cn/s'
|
||||||
|
|
||||||
|
# Offline fixture: real book titles used by the __main__ driver to
# exercise the spider. Many entries deliberately carry subtitles and
# edition notes in (fullwidth) brackets / dashes / colons — __main__
# strips everything after the first such separator before querying.
testbooks =['24堂财富课',
        '甲骨文',
        '庆余年(精校版)',
        '商君书-中华经典名著全本全注全译丛书',
        '苏世民:我的经验与教训(2018读桥水达利欧的原则,2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)',
        '杨伯峻_论语译注',
        '小窗幽记',
        '少年凯歌',
        '投资要义',
        '白鱼解字',
        '历史的巨镜',
        '货币的教训',
        '钱从哪里来',
        '中国古代简史',
        '罗马人的故事(套装共15册)',
        '改变心理学的40项研究',
        '如何假装懂音乐',
        '管子(上下册)--中华经典名著全本全注全译(精)',
        '投资中最简单的事',
        '薛兆丰经济学讲义',
        '枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)',
        '中央帝国的哲学密码',
        '新编说文解字大全集(超值白金版)',
        '市场的逻辑(增订本)',
        '金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)',
        '从零开始学写作:个人增值的有效方法',
        '中国国家治理的制度逻辑:一个组织学研究',
        '中国为什么有前途:对外经济关系的战略潜能(第3版)',
        '日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))']
|
||||||
|
|
||||||
|
testresp = """<div class="result">
|
||||||
|
<div class="pic">
|
||||||
|
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
|
||||||
|
</div>
|
||||||
|
<div class="content">
|
||||||
|
<div class="title">
|
||||||
|
<h3>
|
||||||
|
<span>[书籍]</span> <a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
|
||||||
|
</h3>
|
||||||
|
|
||||||
|
<div class="rating-info">
|
||||||
|
|
||||||
|
<span class="allstar45"></span>
|
||||||
|
<span class="rating_nums">9.4</span>
|
||||||
|
<span>(145人评价)</span>
|
||||||
|
|
||||||
|
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
div class="result">
|
||||||
|
<div class="pic">
|
||||||
|
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
|
||||||
|
</div>
|
||||||
|
<div class="content">
|
||||||
|
<div class="title">
|
||||||
|
<h3>
|
||||||
|
<span>[书籍]</span> <a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
|
||||||
|
</h3>
|
||||||
|
|
||||||
|
<div class="rating-info">
|
||||||
|
|
||||||
|
<span class="allstar45"></span>
|
||||||
|
<span class="rating_nums">9.4</span>
|
||||||
|
<span>(145人评价)</span>
|
||||||
|
|
||||||
|
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
|
||||||
|
</div>
|
||||||
|
</div>"""
|
||||||
|
|
||||||
|
class doubanSpide():
    """Book-search spider scraping douban / amazon.cn result pages.

    The parsing regexes are chosen once, at class-creation time, from
    the module-level ``spidetp`` switch (0 = douban, 1 = amazon); the
    methods also read the module-level ``murl`` / ``mheaders`` /
    ``mparams`` request configuration.
    """

    # Regex slots; the branch below fills the ones the active site needs.
    # Bug fix: the original initializer listed re_bn twice and never
    # initialized re_img.
    re_img = re_bn = re_rate = re_norate = re_author = None

    if spidetp==0:
        # The douban result page is scanned line by line, so each pattern
        # only needs to match inside a single line.
        # (A dead first re_bn assignment that also captured rating_nums —
        # immediately overwritten in the original — has been removed.)
        re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_norate = re.compile(r'''class=\"allstar00\"''')
        re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
    else:
        re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
        re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
        # author by split
        re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_norate = re.compile(r'''class=\"allstar00\"''')
        re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')

    def __init__(self):
        # No per-instance state; everything lives at class/module level.
        pass

    def parse_books(self, mbkn: str):
        """Query the configured site for *mbkn* and parse the result page.

        mbkn - bookname to be spided
        return: {
            "25853071": {                       # sid
                "bookname": "庆余年",
                "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                "rate": "8.0",
                "author": "猫腻"
            },...}

        Raises Exception when the HTTP status is not 200.
        NOTE(review): the amazon branch (spidetp != 0) currently only
        logs the raw response lines and returns an empty mapping.
        """
        if spidetp==0: #douban
            mparams['search_text'] = mbkn
        else: #amazon
            mparams['k'] = mbkn
        r = requests.get( url=murl, headers=mheaders, params=mparams)

        if r.status_code != 200:
            raise Exception("请求失败")

        bkinfo = defaultdict(dict)
        sid = None
        # Line-oriented state machine over the response:
        # NAME (title/img found) -> RATE (rating found) -> AUTHOR -> NAME
        stat = 'NAME'

        resp = r.text

        if spidetp==0:
            for line in resp.split('\n'):
                line = line.strip()
                if line=='': continue

                if stat=='NAME':
                    ret=re.search(self.re_bn, line)
                    if ret:
                        sid = ret.group(1)
                        bkinfo[sid]['bookname'] = ret.group(2)
                        bkinfo[sid]['img'] = ret.group(3)
                        stat = 'RATE'
                        continue
                elif stat=='RATE':
                    # if no rate, go back to looking for the next book name
                    if re.search(self.re_norate, line):
                        stat = 'NAME'
                        continue
                    ret=re.search(self.re_rate, line)
                    if ret:
                        bkinfo[sid]['rate'] = ret.group(1)
                        stat = 'AUTHOR'
                        continue
                elif stat=='AUTHOR':
                    ret=re.search(self.re_author, line)
                    if ret:
                        # subject-cast is "author / press / year"; keep the
                        # first space-separated token (the author).
                        bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
                        stat = 'NAME'
                        continue
                else: continue
        else:
            logger.debug('================ {} ================'.format(mbkn))
            for line in resp.split('\n'):
                line = line.strip()
                if line=='': continue

                logger.debug(line)

        return bkinfo

    def filter_spide_books(self, mbkn, mbkinfo):
        """Pick the best single match for *mbkn* from spided results.

        mbkn - bookname to be spide
        mbkinfo - mapping with the same shape as parse_books() returns.

        Only complete entries (all four of bookname/img/rate/author) are
        eligible. Preference order:
          1. the searched name is contained in the result's name,
          2. the result's name is contained in the searched name,
          3. the first complete entry seen.
        Returns a one-entry dict {sid: info}, or 0 when nothing matched
        (the 0 sentinel is kept for backward compatibility).
        """
        #booklink - https://book.douban.com/subject/{sid}
        [f1,f2,f3] = [0,0,0]
        [d1,d2,d3] = [{},{},{}]
        for k,v in mbkinfo.items():
            bkn = v['bookname']
            if len(v) == 4:
                if (not f1) and (mbkn in bkn):
                    f1 = 1
                    d1 = {k:v}
                elif (not f1) and (not f2) and (bkn in mbkn):
                    f2 = 1
                    d2 = {k:v}
                elif (not f3):
                    f3 = 1
                    d3 = {k:v}
                else: continue
            else:
                continue

        if f1:
            return d1
        elif f2:
            return d2
        elif f3:
            return d3

        return 0
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Driver: spide every title in the test list and log the filtered
    # best match as pretty-printed JSON.
    crawler = doubanSpide()

    for raw_title in testbooks:
        # Keep only the leading part of the title: cut at the first
        # bracket / dash / colon / underscore / whitespace, matching both
        # ASCII and fullwidth punctuation.
        title = re.split(r'[\((\-\::_\s]', raw_title.strip())[0]
        print(title)

        found = crawler.parse_books(title)
        best = crawler.filter_spide_books(title, found)

        logger.debug('================ {} ================'.format(title))
        logger.debug(json.dumps(best, indent=2, ensure_ascii=False))
|
||||||
|
|
||||||
19
tparseamazon.py
Normal file
19
tparseamazon.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Fixture: author <div> fragments as they appear on amazon.cn search
# result pages, used to prototype author extraction below. Entries 2/3
# and 5/6 are byte-identical duplicates.
s =['''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">丹·琼斯(Dan Jones)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">杰弗里·瓦夫罗(Geoffrey Wawro)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">克里斯托弗·希伯特(Christopher Hibbert)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">罗斯·金(Ross King)</span><span class="a-size-base" dir="auto">等等。</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[美]威廉·厄本(William Urban)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">陆大鹏</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">刘晓晖</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">堀田江理(Eri Hotta)</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">景跃进</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">张小劲</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">余逊达</span></div>''']
|
||||||
|
|
||||||
|
for fragment in s:
    # Split on span-open / span-close tags; the author names then sit at
    # indices 3, 7, 11, ... of the resulting pieces (each name is
    # surrounded by the '>' leftovers and the ', '/'、 ' separators).
    pieces = re.split('<span.+?auto\">|<\/span', fragment)
    names = pieces[3::4]
    print(','.join(names))
|
||||||
|
|
||||||
BIN
vocab.empty.db
BIN
vocab.empty.db
Binary file not shown.
118
xx
Normal file
118
xx
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"丹·琼斯(Dan Jones)",
|
||||||
|
">",
|
||||||
|
", ",
|
||||||
|
">",
|
||||||
|
"杰弗里·瓦夫罗(Geoffrey Wawro)",
|
||||||
|
">",
|
||||||
|
", ",
|
||||||
|
">",
|
||||||
|
"克里斯托弗·希伯特(Christopher Hibbert)",
|
||||||
|
">",
|
||||||
|
", ",
|
||||||
|
">",
|
||||||
|
"罗斯·金(Ross King)",
|
||||||
|
">",
|
||||||
|
"等等。",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"马克·哈里斯",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"黎绮妮",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"马克·哈里斯",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"黎绮妮",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"[美]威廉·厄本(William Urban)",
|
||||||
|
">",
|
||||||
|
", ",
|
||||||
|
">",
|
||||||
|
"陆大鹏",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"刘晓晖",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"[英]安德鲁·罗伯茨(Andrew Roberts)",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"苏然",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"[英]安德鲁·罗伯茨(Andrew Roberts)",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"苏然",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"堀田江理(Eri Hotta)",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[
|
||||||
|
"<div class=\"a-row a-size-base a-color-secondary\">",
|
||||||
|
"",
|
||||||
|
">",
|
||||||
|
"景跃进",
|
||||||
|
">",
|
||||||
|
", ",
|
||||||
|
">",
|
||||||
|
"张小劲",
|
||||||
|
">",
|
||||||
|
"、 ",
|
||||||
|
">",
|
||||||
|
"余逊达",
|
||||||
|
"></div>"
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user