kindle manager
This commit is contained in:
79
tbook.py
79
tbook.py
@@ -138,7 +138,7 @@ class doubanSpide():
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
[re_bn,re_bn,re_rate,re_norate,re_author] = [None,None,None,None,None]
|
# One placeholder per compiled pattern, so every attribute exists whichever
# branch below runs. The original listed re_bn twice and never initialised
# re_asin / re_img, which the amazon branch assigns and the parser reads.
[re_asin,re_img,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None,None]
|
||||||
if spidetp==0:
|
if spidetp==0:
|
||||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||||
@@ -146,12 +146,15 @@ class doubanSpide():
|
|||||||
re_norate = re.compile(r'''class=\"allstar00\"''')
|
re_norate = re.compile(r'''class=\"allstar00\"''')
|
||||||
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
|
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
|
||||||
else:
|
else:
|
||||||
|
re_asin = re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
|
||||||
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
|
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
|
||||||
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
|
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
|
||||||
# author by split
|
re_author = re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
|
||||||
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
re_rate = re.compile(r'''^<span aria-label=\"(.+?)\">$''')
|
||||||
re_norate = re.compile(r'''class=\"allstar00\"''')
|
#re_end = re.compile(r'''<\/body><\/html>''')
|
||||||
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
|
re_end = re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
|
||||||
|
|
||||||
|
#amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
@@ -171,6 +174,7 @@ class doubanSpide():
|
|||||||
mparams['search_text'] = mbkn
|
mparams['search_text'] = mbkn
|
||||||
else: #amazon
|
else: #amazon
|
||||||
mparams['k'] = mbkn
|
mparams['k'] = mbkn
|
||||||
|
|
||||||
r = requests.get( url=murl, headers=mheaders, params=mparams)
|
r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
@@ -178,16 +182,17 @@ class doubanSpide():
|
|||||||
|
|
||||||
bkinfo = defaultdict(dict)
|
bkinfo = defaultdict(dict)
|
||||||
sid = None
|
sid = None
|
||||||
stat = 'NAME'
|
stat = None
|
||||||
|
|
||||||
resp = r.text
|
resp = r.text
|
||||||
|
|
||||||
if spidetp==0:
|
if spidetp==0:
|
||||||
|
stat = 'SID'
|
||||||
for line in resp.split('\n'):
|
for line in resp.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line=='': continue
|
if line=='': continue
|
||||||
|
|
||||||
if stat=='NAME':
|
if stat=='SID':
|
||||||
ret=re.search(self.re_bn, line)
|
ret=re.search(self.re_bn, line)
|
||||||
if ret:
|
if ret:
|
||||||
sid = ret.group(1)
|
sid = ret.group(1)
|
||||||
@@ -198,7 +203,7 @@ class doubanSpide():
|
|||||||
elif stat=='RATE':
|
elif stat=='RATE':
|
||||||
# if no rate, goto next bookname state
|
# if no rate, goto next bookname state
|
||||||
if re.search(self.re_norate, line):
|
if re.search(self.re_norate, line):
|
||||||
stat = 'NAME'
|
stat = 'SID'
|
||||||
continue
|
continue
|
||||||
ret=re.search(self.re_rate, line)
|
ret=re.search(self.re_rate, line)
|
||||||
if ret:
|
if ret:
|
||||||
@@ -209,30 +214,78 @@ class doubanSpide():
|
|||||||
ret=re.search(self.re_author, line)
|
ret=re.search(self.re_author, line)
|
||||||
if ret:
|
if ret:
|
||||||
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
|
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
|
||||||
stat = 'NAME'
|
stat = 'SID'
|
||||||
continue
|
|
||||||
else: continue
|
else: continue
|
||||||
else:
|
else:
|
||||||
logger.debug('================ {} ================'.format(mbkn))
|
stat='ASIN'
|
||||||
for line in resp.split('\n'):
|
for line in resp.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line=='': continue
|
if line=='': continue
|
||||||
|
|
||||||
logger.debug(line)
|
if stat=='ASIN':
|
||||||
|
ret=re.search(self.re_asin, line)
|
||||||
|
if ret:
|
||||||
|
sid = ret.group(1)
|
||||||
|
stat = 'IMG'
|
||||||
|
continue
|
||||||
|
elif stat=='IMG':
|
||||||
|
ret=re.search(self.re_img, line)
|
||||||
|
if ret:
|
||||||
|
bkinfo[sid]['img'] = ret.group(1)
|
||||||
|
stat = 'BOOKNAME'
|
||||||
|
continue
|
||||||
|
elif stat=='BOOKNAME':
|
||||||
|
ret=re.search(self.re_bn, line)
|
||||||
|
if ret:
|
||||||
|
bkname = re.split(r'[((\s]',ret.group(1).strip())[0]
|
||||||
|
bkinfo[sid]['bookname'] = bkname
|
||||||
|
stat = 'AUTHOR'
|
||||||
|
continue
|
||||||
|
elif stat=='AUTHOR':
|
||||||
|
ret=re.search(self.re_author, line)
|
||||||
|
if ret:
|
||||||
|
# Split the author row on its <span ...auto"> / </span delimiters; every 4th
# piece starting at index 3 is an author name. A raw string is used because
# the previous non-raw pattern contained the invalid escape "\/" (a
# SyntaxWarning on recent Python); the regex itself matches the same text.
author = ','.join(re.split(r'<span.+?auto">|</span', ret.group(0))[3::4])
|
||||||
|
bkinfo[sid]['author'] = author
|
||||||
|
stat = 'RATE'
|
||||||
|
continue
|
||||||
|
elif stat=='RATE':
|
||||||
|
ret=re.search(self.re_rate, line)
|
||||||
|
if ret:
|
||||||
|
bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
|
||||||
|
stat = 'AUTHOR'
|
||||||
|
continue
|
||||||
|
else: continue
|
||||||
|
|
||||||
|
if re.search(self.re_end, line):
|
||||||
|
# End-of-item marker seen: reset the state machine so the next result's ASIN
# is picked up. The original used "==" (a discarded comparison), so the
# state was never reset and later items overwrote the first entry.
stat = 'ASIN'
|
||||||
|
continue
|
||||||
|
|
||||||
return bkinfo
|
return bkinfo
|
||||||
|
|
||||||
def filter_spide_books(self, mbkn, mbkinfo):
|
def filter_spide_books(self, mbkn, mbkinfo):
|
||||||
""" mbkn - bookname to be spide
|
""" mbkn - bookname to be spide
|
||||||
mbkinfo: {
|
mbkinfo:
|
||||||
|
douban
|
||||||
|
{
|
||||||
"25853071": { # sid
|
"25853071": { # sid
|
||||||
"bookname": "庆余年",
|
"bookname": "庆余年",
|
||||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||||
"rate": "8.0",
|
"rate": "8.0",
|
||||||
"author": "猫腻"
|
"author": "猫腻"
|
||||||
},...}
|
},...}
|
||||||
|
amazon
|
||||||
|
"B07RN73425": {
|
||||||
|
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||||
|
"bookname": "古典名著普及文库:孟子",
|
||||||
|
"author": "孙钦善",
|
||||||
|
"rate": "3.9"
|
||||||
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
#booklink - https://book.douban.com/subject/{sid}
|
#booklink - https://book.douban.com/subject/{sid}
|
||||||
|
# f1/d1: mbkn include in bookname
|
||||||
|
# f2/d2: bookname include mbkn
|
||||||
|
# f3/d3: mbkn and bookname different
|
||||||
[f1,f2,f3] = [0,0,0]
|
[f1,f2,f3] = [0,0,0]
|
||||||
[d1,d2,d3] = [{},{},{}]
|
[d1,d2,d3] = [{},{},{}]
|
||||||
for k,v in mbkinfo.items():
|
for k,v in mbkinfo.items():
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
#########################################################
|
||||||
|
## @file : tparseamazon.py
|
||||||
|
## @desc : test parse amazon response
|
||||||
|
## @create : 2020/05/26
|
||||||
|
## @author : Chengan
|
||||||
|
## @email : douboer@gmail.com
|
||||||
|
#########################################################
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
@@ -11,9 +18,33 @@ s =['''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base
|
|||||||
'''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">堀田江理(Eri Hotta)</span></div>''',
|
'''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">堀田江理(Eri Hotta)</span></div>''',
|
||||||
'''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">景跃进</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">张小劲</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">余逊达</span></div>''']
|
'''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">景跃进</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">张小劲</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">余逊达</span></div>''']
|
||||||
|
|
||||||
|
tre = ['''<div data-asin="B07S46LMCK" data-index="3" data-uuid="ef76dd52-2ab2-4372-8597-2c3258698b6e" data-component-type="s-search-result" class="sg-col-20-of-24 s-result-item s-asin sg-col-0-of-12 sg-col-28-of-32 sg-col-16-of-20 sg-col sg-col-32-of-36 sg-col-12-of-16 sg-col-24-of-28"><div class="sg-col-inner">''',
|
||||||
|
'''<img src="https://images-cn.ssl-images-amazon.com/images/I/41JFUwuJPCL._AC_UY218_.jpg"''',
|
||||||
|
'''alt="小窗幽记(国学大书院)(为人处世的智慧之果 修身齐家的行动指南)"''',
|
||||||
|
'''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">野田洋次郎</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">蒋青青</span></div>''',
|
||||||
|
'''<span aria-label="4.7 颗星,最多 5 颗星">''',
|
||||||
|
'''<span class="a-letter-space"></span></div></div>''']
|
||||||
|
|
||||||
|
|
||||||
for t in s:
|
for t in s:
|
||||||
ret = re.split('<span.+?auto\">|<\/span',t)
|
ret = re.split('<span.+?auto\">|<\/span',t)
|
||||||
fret = ret[3::4]
|
fret = ret[3::4]
|
||||||
#print(json.dumps(re.split('<span.+?auto\">|<\/span',t), indent=2, ensure_ascii=False))
|
#print(json.dumps(re.split('<span.+?auto\">|<\/span',t), indent=2, ensure_ascii=False))
|
||||||
print(','.join(fret))
|
print(','.join(fret))
|
||||||
|
|
||||||
|
re_asin = re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
|
||||||
|
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
|
||||||
|
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
|
||||||
|
re_author = re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
|
||||||
|
re_rate = re.compile(r'''^<span aria-label=\"(.+?)\">$''')
|
||||||
|
#re_end = re.compile(r'''<\/body><\/html>''')
|
||||||
|
re_end = re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
|
||||||
|
|
||||||
|
print(re.search(re_asin, tre[0]).group(1))
|
||||||
|
print(re.search(re_img , tre[1]).group(1))
|
||||||
|
print(re.search(re_bn , tre[2]).group(1))
|
||||||
|
print(re.search(re_author,tre[3]).group(0))
|
||||||
|
print(re.search(re_rate, tre[4]).group(1))
|
||||||
|
print(re.search(re_end , tre[5]).group(0))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user