kindle manager
This commit is contained in:
188
parseweb.py
188
parseweb.py
@@ -85,81 +85,24 @@ testbooks =['24堂财富课',
|
||||
'中国为什么有前途:对外经济关系的战略潜能(第3版)',
|
||||
'日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))']
|
||||
|
||||
testresp = """<div class="result">
|
||||
<div class="pic">
|
||||
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
|
||||
</div>
|
||||
<div class="content">
|
||||
<div class="title">
|
||||
<h3>
|
||||
<span>[书籍]</span> <a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
|
||||
</h3>
|
||||
|
||||
<div class="rating-info">
|
||||
|
||||
<span class="allstar45"></span>
|
||||
<span class="rating_nums">9.4</span>
|
||||
<span>(145人评价)</span>
|
||||
|
||||
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
|
||||
</div>
|
||||
</div>
|
||||
div class="result">
|
||||
<div class="pic">
|
||||
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
|
||||
</div>
|
||||
<div class="content">
|
||||
<div class="title">
|
||||
<h3>
|
||||
<span>[书籍]</span> <a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&cat_id=1001&type=search&pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
|
||||
</h3>
|
||||
|
||||
<div class="rating-info">
|
||||
|
||||
<span class="allstar45"></span>
|
||||
<span class="rating_nums">9.4</span>
|
||||
<span>(145人评价)</span>
|
||||
|
||||
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
|
||||
</div>
|
||||
</div>"""
|
||||
|
||||
class bookInfoSpide():
|
||||
|
||||
"""
|
||||
re_bn = re.compile(r'''
|
||||
class=\"nbg.+?sid: (\d+?),.+?
|
||||
title=\"(.+?)\".+?
|
||||
img src=\"(.+?)\".+?
|
||||
rating_nums\">(.+?)<
|
||||
''',flags=re.S|re.X)
|
||||
"""
|
||||
|
||||
|
||||
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
|
||||
[re_bn,re_bn,re_score,re_star,re_author,re_end]=[None,None,None,None,None,None]
|
||||
if ISDOUBAN==1:
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
||||
re_norate = re.compile(r'''class=\"allstar00\"''')
|
||||
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
|
||||
re_bn=re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||
re_bn=re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||
re_star=re.compile(r'''^<span class=\"allstar(\d+)\"></span>''')
|
||||
re_score=re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
||||
re_ratenum=re.compile(r'''^<span>\((\d+)人评价\)</span>''')
|
||||
re_author=re.compile(r'''class=\"subject-cast\">(.+?)<''')
|
||||
else:
|
||||
re_asin = re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
|
||||
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
|
||||
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
|
||||
re_author = re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
|
||||
re_rate = re.compile(r'''^<span aria-label=\"(.+?)\">$''')
|
||||
#re_end = re.compile(r'''<\/body><\/html>''')
|
||||
re_end = re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
|
||||
|
||||
re_asin=re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
|
||||
re_img=re.compile(r'''^<img src=\"(.+?)\"$''')
|
||||
re_bn=re.compile(r'''^alt=\"(.+?)\"$''')
|
||||
re_author=re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
|
||||
re_rate=re.compile(r'''^<span aria-label=\"(.+?)\">$''')
|
||||
#re_end=re.compile(r'''<\/body><\/html>''')
|
||||
re_end=re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
@@ -171,8 +114,11 @@ class bookInfoSpide():
|
||||
"link":"https://....xxxxx"
|
||||
"bookname": "庆余年",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
"score": "8.0",
|
||||
"ratenum": "1000",
|
||||
"author": "猫腻"
|
||||
"publisher": "中华书局"
|
||||
"publishing": "2015"
|
||||
},...}
|
||||
"""
|
||||
|
||||
@@ -187,15 +133,8 @@ class bookInfoSpide():
|
||||
session.params = mparams
|
||||
r = session.get( url=murl, headers=mheaders, params=mparams)
|
||||
#r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||
except requests.exceptions.ConnectionError:
|
||||
print('ConnectionError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
except requests.exceptions.ChunkedEncodingError:
|
||||
print('ChunkedEncodingError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
except:
|
||||
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
if r.status_code != 200:
|
||||
print('grab book {} info from webside failure'.format(mbkn))
|
||||
@@ -207,35 +146,50 @@ class bookInfoSpide():
|
||||
resp = r.text
|
||||
|
||||
if ISDOUBAN==1:
|
||||
stat = 'SID'
|
||||
stat='SID'
|
||||
for line in resp.split('\n'):
|
||||
line = line.strip()
|
||||
line=line.strip()
|
||||
if line=='': continue
|
||||
|
||||
if stat=='SID':
|
||||
ret=re.search(self.re_bn, line)
|
||||
if ret:
|
||||
sid = ret.group(1)
|
||||
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
|
||||
bkinfo[sid]['bookname'] = ret.group(2)
|
||||
bkinfo[sid]['img'] = ret.group(3)
|
||||
stat = 'RATE'
|
||||
sid=ret.group(1)
|
||||
bkinfo[sid]['link']=os.path.join(LINKPREF,sid)
|
||||
bkinfo[sid]['bookname']=ret.group(2)
|
||||
bkinfo[sid]['img']=ret.group(3)
|
||||
stat='STAR'
|
||||
continue
|
||||
elif stat=='RATE':
|
||||
# if the book has no rating, return to the SID state and look for the next book
|
||||
if re.search(self.re_norate, line):
|
||||
stat = 'SID'
|
||||
continue
|
||||
ret=re.search(self.re_rate, line)
|
||||
elif stat=='STAR':
|
||||
ret=re.search(self.re_star, line)
|
||||
if ret:
|
||||
bkinfo[sid]['rate'] = ret.group(1)
|
||||
stat = 'AUTHOR'
|
||||
star = ret.group(1)
|
||||
if star=='00':
|
||||
stat='AUTHOR'
|
||||
elif int(star) > 0:
|
||||
stat='SCORE'
|
||||
elif stat=='SCORE':
|
||||
ret=re.search(self.re_score, line)
|
||||
if ret:
|
||||
bkinfo[sid]['score']=ret.group(1)
|
||||
stat='RATENUM'
|
||||
continue
|
||||
elif stat=='RATENUM':
|
||||
ret=re.search(self.re_ratenum, line)
|
||||
if ret:
|
||||
bkinfo[sid]['ratenum']=ret.group(1)
|
||||
stat='AUTHOR'
|
||||
continue
|
||||
elif stat=='AUTHOR':
|
||||
ret=re.search(self.re_author, line)
|
||||
if ret:
|
||||
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
|
||||
stat = 'SID'
|
||||
tt=ret.group(1).split(' / ')
|
||||
if len(tt)>=3:
|
||||
*author, bkinfo[sid]['publisher'], bkinfo[sid]['publishing']=tt
|
||||
bkinfo[sid]['author']='/'.join(author)
|
||||
else:
|
||||
bkinfo[sid]['author']=ret[0]
|
||||
stat='SID'
|
||||
else: continue
|
||||
else:
|
||||
stat='ASIN'
|
||||
@@ -288,13 +242,15 @@ class bookInfoSpide():
|
||||
"""
|
||||
mbkinfo:
|
||||
douban
|
||||
{
|
||||
"庆余年": {
|
||||
"link":"https://....25853071",
|
||||
"bookname": "庆余年xxx",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
"author": "猫腻"
|
||||
"10530219": {
|
||||
"link": "https://book.douban.com/subject/10530219",
|
||||
"bookname": "市场的逻辑",
|
||||
"img": "https://img3.doubanio.com/view/subject/s/public/s8912552.jpg",
|
||||
"score": "8.3",
|
||||
"ratenum": "218",
|
||||
"publisher": "世纪文景 上海人民出版社",
|
||||
"publishing": "2012",
|
||||
"author": "张维迎"
|
||||
},...}
|
||||
amazon
|
||||
"孟子": {
|
||||
@@ -310,21 +266,21 @@ class bookInfoSpide():
|
||||
# f1/d1: mbkn is contained in bookname (mbkn in bkn)
|
||||
# f2/d2: bookname is contained in mbkn (bkn in mbkn)
|
||||
# f3/d3: mbkn and bookname differ (fallback when neither branch above is taken)
|
||||
[f1,f2,f3] = [0,0,0]
|
||||
[d1,d2,d3] = [{},{},{}]
|
||||
mbkn = mbkinfo[0]
|
||||
[f1,f2,f3]=[0,0,0]
|
||||
[d1,d2,d3] =[{},{},{}]
|
||||
mbkn=mbkinfo[0]
|
||||
for k,v in mbkinfo[1].items():
|
||||
bkn = v['bookname']
|
||||
if len(v) == 5:
|
||||
bkn=v['bookname']
|
||||
if len(v)==8:
|
||||
if (not f1) and (mbkn in bkn):
|
||||
f1 = 1
|
||||
d1 = {mbkn:v}
|
||||
f1=1
|
||||
d1={mbkn:v}
|
||||
elif (not f1) and (not f2) and (bkn in mbkn):
|
||||
f2 = 1
|
||||
d2 = {mbkn:v}
|
||||
f2=1
|
||||
d2={mbkn:v}
|
||||
elif (not f3):
|
||||
f3 = 1
|
||||
d3 = {mbkn:v}
|
||||
f3=1
|
||||
d3={mbkn:v}
|
||||
else: continue
|
||||
else:
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user