kindle manager

This commit is contained in:
gavin
2020-06-14 18:21:40 +08:00
parent 37294e9da2
commit 85fc992180
3 changed files with 123 additions and 2978 deletions

View File

@@ -138,7 +138,7 @@ class doubanSpide():
"""
[re_bn,re_bn,re_rate,re_norate,re_author] = [None,None,None,None,None]
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
if spidetp==0:
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
@@ -146,12 +146,15 @@ class doubanSpide():
re_norate = re.compile(r'''class=\"allstar00\"''')
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
else:
re_asin = re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
# author by split
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
re_norate = re.compile(r'''class=\"allstar00\"''')
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
re_author = re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
re_rate = re.compile(r'''^<span aria-label=\"(.+?)\">$''')
#re_end = re.compile(r'''<\/body><\/html>''')
re_end = re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
#amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
# Constructor: takes no arguments besides self and performs no setup (body is `pass`).
def __init__(self):
pass
@@ -171,6 +174,7 @@ class doubanSpide():
mparams['search_text'] = mbkn
else: #amazon
mparams['k'] = mbkn
r = requests.get( url=murl, headers=mheaders, params=mparams)
if r.status_code != 200:
@@ -178,16 +182,17 @@ class doubanSpide():
bkinfo = defaultdict(dict)
sid = None
stat = 'NAME'
stat = None
resp = r.text
if spidetp==0:
stat = 'SID'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
if stat=='NAME':
if stat=='SID':
ret=re.search(self.re_bn, line)
if ret:
sid = ret.group(1)
@@ -198,7 +203,7 @@ class doubanSpide():
elif stat=='RATE':
# if no rate, goto next bookname state
if re.search(self.re_norate, line):
stat = 'NAME'
stat = 'SID'
continue
ret=re.search(self.re_rate, line)
if ret:
@@ -209,30 +214,78 @@ class doubanSpide():
ret=re.search(self.re_author, line)
if ret:
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
stat = 'NAME'
continue
stat = 'SID'
else: continue
else:
logger.debug('================ {} ================'.format(mbkn))
stat='ASIN'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
logger.debug(line)
if stat=='ASIN':
ret=re.search(self.re_asin, line)
if ret:
sid = ret.group(1)
stat = 'IMG'
continue
elif stat=='IMG':
ret=re.search(self.re_img, line)
if ret:
bkinfo[sid]['img'] = ret.group(1)
stat = 'BOOKNAME'
continue
elif stat=='BOOKNAME':
ret=re.search(self.re_bn, line)
if ret:
bkname = re.split(r'[(\s]',ret.group(1).strip())[0]
bkinfo[sid]['bookname'] = bkname
stat = 'AUTHOR'
continue
elif stat=='AUTHOR':
ret=re.search(self.re_author, line)
if ret:
author = ','.join(re.split('<span.+?auto\">|<\/span', ret.group(0))[3::4])
bkinfo[sid]['author'] = author
stat = 'RATE'
continue
elif stat=='RATE':
ret=re.search(self.re_rate, line)
if ret:
bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
stat = 'AUTHOR'
continue
else: continue
if re.search(self.re_end, line):
stat=='ASIN'
continue
return bkinfo
def filter_spide_books(self, mbkn, mbkinfo):
""" mbkn - bookname to be spide
mbkinfo: {
mbkinfo:
douban
{
"25853071": { # sid
"bookname": "庆余年",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
"author": "猫腻"
"bookname": "庆余年",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
"author": "猫腻"
},...}
amazon
"B07RN73425": {
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
"bookname": "古典名著普及文库:孟子",
"author": "孙钦善",
"rate": "3.9"
}
"""
#booklink - https://book.douban.com/subject/{sid}
# f1/d1: mbkn include in bookname
# f2/d2: bookname include mbkn
# f3/d3: mbkn and bookname different
[f1,f2,f3] = [0,0,0]
[d1,d2,d3] = [{},{},{}]
for k,v in mbkinfo.items():