kindle manager

This commit is contained in:
gavin
2020-06-14 13:57:58 +08:00
parent 6423711484
commit 37294e9da2
7 changed files with 5321 additions and 3123 deletions

3077
debug

File diff suppressed because it is too large Load Diff

1957
export.md

File diff suppressed because it is too large Load Diff

276
tbook.py Normal file
View File

@@ -0,0 +1,276 @@
#########################################################
## @file : tbook.py
## @desc : douban book spider
## @create : 2020/6/13
## @author : Chengan
## @email : douboer@gmail.com
#########################################################
import requests
import json
import re
import os
import subprocess
import logging
from collections import defaultdict
# log info
# NOTE: logging.getLogger() with no name is the root logger, so everything
# logged at DEBUG or above is appended to a file called 'log' in the CWD.
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)

# Which site to query: 0 - douban, 1 - amazon
spidetp = 1

# HTTP headers sent with every search request.  Host/Referer start out as the
# douban values and are overridden below when amazon is selected.
mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'http://www.douban.com',
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}

# Query-string template (mparams) and search endpoint (murl) for the selected
# site.  The 'bkname_xxx' / 'bookname_xxx' values are placeholders; the real
# book name is filled in per request.
mparams = {}
murl = ""
if spidetp == 0:
    # No trailing comma here: it would turn the value into a 1-tuple.
    mparams['Host'] = 'www.douban.com'
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat'] = '1001'
    mparams['k'] = 'bookname_xxx'
    murl = "https://search.douban.com/book/subject_search"
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'http://www.amazon.cn'
    # Reference URL the parameters below are modelled on:
    # https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
    mparams['Host'] = 'www.amazon.cn'
    mparams['k'] = 'bkname_xxx'
    mparams['i'] = 'stripbooks'
    # Key names must match the reference URL exactly: no trailing '=' on
    # '__mk_zh_CN', and 'ref' rather than 'reg'.
    mparams['__mk_zh_CN'] = '亚马逊网站'
    mparams['ref'] = 'nb_sb_noss'
    murl = 'https://www.amazon.cn/s'
# Sample book titles (as they appear in a Kindle library) used by the
# __main__ driver to exercise the spider.
testbooks = [
    "24堂财富课",
    "甲骨文",
    "庆余年(精校版)",
    "商君书-中华经典名著全本全注全译丛书",
    "苏世民:我的经验与教训2018读桥水达利欧的原则2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)",
    "杨伯峻_论语译注",
    "小窗幽记",
    "少年凯歌",
    "投资要义",
    "白鱼解字",
    "历史的巨镜",
    "货币的教训",
    "钱从哪里来",
    "中国古代简史",
    "罗马人的故事(套装共15册)",
    "改变心理学的40项研究",
    "如何假装懂音乐",
    "管子(上下册)--中华经典名著全本全注全译(精)",
    "投资中最简单的事",
    "薛兆丰经济学讲义",
    "枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)",
    "中央帝国的哲学密码",
    "新编说文解字大全集(超值白金版)",
    "市场的逻辑(增订本)",
    "金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)",
    "从零开始学写作:个人增值的有效方法",
    "中国国家治理的制度逻辑:一个组织学研究",
    "中国为什么有前途对外经济关系的战略潜能第3版",
    "日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))",
]
# Canned excerpt of a douban search-result page: two identical
# <div class="result"> records, kept as an offline sample of the markup the
# douban regexes are written against.
testresp = """<div class="result">
<div class="pic">
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
</div>
<div class="content">
<div class="title">
<h3>
<span>[书籍]</span>&nbsp;<a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
</h3>
<div class="rating-info">
<span class="allstar45"></span>
<span class="rating_nums">9.4</span>
<span>(145人评价)</span>
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
</div>
</div>
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
</div>
</div>
<div class="result">
<div class="pic">
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
</div>
<div class="content">
<div class="title">
<h3>
<span>[书籍]</span>&nbsp;<a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
</h3>
<div class="rating-info">
<span class="allstar45"></span>
<span class="rating_nums">9.4</span>
<span>(145人评价)</span>
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
</div>
</div>
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
</div>
</div>"""
class doubanSpide():
    """Search a book site for a title and pick the best-matching result.

    The site is selected by the module-level ``spidetp`` flag (0 - douban,
    anything else - amazon).  Only the douban response is actually parsed;
    in amazon mode the response lines are merely written to the debug log.
    """

    # Line-oriented patterns, applied to each stripped line of the response.
    # Every attribute is bound in both modes so that looking one up never
    # raises AttributeError (re_img stays None in douban mode).
    re_bn = re_img = re_rate = re_norate = re_author = None
    if spidetp == 0:
        # groups: (1) subject id, (2) title, (3) cover image URL
        re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
        # "allstar00" marks a result that has no rating
        re_norate = re.compile(r'''class=\"allstar00\"''')
        re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
    else:
        re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
        re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
        # author by split
        re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_norate = re.compile(r'''class=\"allstar00\"''')
        re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')

    # Fields a parsed record must carry to be considered by the filter.
    _REQUIRED_FIELDS = frozenset(('bookname', 'img', 'rate', 'author'))

    def __init__(self):
        pass

    def parse_books(self, mbkn: str, timeout: float = 10):
        """Query the configured site for *mbkn* and parse the result page.

        mbkn    - book name to search for
        timeout - seconds to wait for the HTTP request before giving up

        return: {
            "25853071": {   # sid
                "bookname": "庆余年",
                "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                "rate": "8.0",
                "author": "猫腻"
            },...}

        In amazon mode the returned mapping is always empty (the page is only
        logged).  Raises Exception when the HTTP status is not 200; errors
        raised by ``requests`` itself (e.g. on timeout) propagate unchanged.
        """
        # Work on a copy so one lookup cannot leak its query into the shared
        # module-level parameter template.
        params = dict(mparams)
        if spidetp == 0:  # douban
            params['search_text'] = mbkn
        else:  # amazon
            params['k'] = mbkn

        # Without a timeout a stalled server would block the caller forever.
        r = requests.get(url=murl, headers=mheaders, params=params,
                         timeout=timeout)
        if r.status_code != 200:
            raise Exception("请求失败")

        if spidetp == 0:
            return self._parse_douban(r.text)

        # amazon: parsing is not implemented, dump the page for inspection.
        bkinfo = defaultdict(dict)
        logger.debug('================ {} ================'.format(mbkn))
        for line in r.text.split('\n'):
            line = line.strip()
            if line == '':
                continue
            logger.debug(line)
        return bkinfo

    def _parse_douban(self, resp):
        """Extract {sid: record} from douban HTML with a NAME -> RATE -> AUTHOR line state machine."""
        bkinfo = defaultdict(dict)
        sid = None
        stat = 'NAME'
        for line in resp.split('\n'):
            line = line.strip()
            if line == '':
                continue
            if stat == 'NAME':
                ret = self.re_bn.search(line)
                if ret:
                    sid = ret.group(1)
                    bkinfo[sid]['bookname'] = ret.group(2)
                    bkinfo[sid]['img'] = ret.group(3)
                    stat = 'RATE'
            elif stat == 'RATE':
                # if no rate, goto next bookname state; the record is left
                # without 'rate'/'author' and is skipped by the filter later
                if self.re_norate.search(line):
                    stat = 'NAME'
                    continue
                ret = self.re_rate.search(line)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat == 'AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    # the cast line reads "author / publisher / year";
                    # keep the text before the first space
                    bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
                    stat = 'NAME'
        return bkinfo

    def filter_spide_books(self, mbkn, mbkinfo):
        """Pick the single record of *mbkinfo* that best matches *mbkn*.

        mbkn    - book name that was searched for
        mbkinfo: {
            "25853071": {   # sid
                "bookname": "庆余年",
                "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                "rate": "8.0",
                "author": "猫腻"
            },...}

        Only records carrying bookname, img, rate and author are considered.
        Preference order, first hit in iteration order wins within each tier:
          1. the searched name is contained in the record's title
          2. the record's title is contained in the searched name
          3. any complete record
        Returns a one-entry dict {sid: record}, or 0 when there is no
        complete record at all.
        """
        #booklink - https://book.douban.com/subject/{sid}
        complete = [(sid, info) for sid, info in mbkinfo.items()
                    if self._REQUIRED_FIELDS <= info.keys()]

        for sid, info in complete:
            if mbkn in info['bookname']:
                return {sid: info}
        for sid, info in complete:
            if info['bookname'] in mbkn:
                return {sid: info}
        if complete:
            sid, info = complete[0]
            return {sid: info}
        # 0 (not an empty dict) is kept as the "nothing found" marker for
        # backward compatibility with existing callers.
        return 0
def strip_subtitle(name):
    """Return the leading part of a book title, before any subtitle marker.

    The title is cut at the first parenthesis, hyphen, colon, underscore or
    whitespace.  Both the ASCII and the full-width forms of '(' and ':' are
    recognised, because the Kindle titles use the full-width ones.  If the
    title starts with a marker, the whole stripped title is returned rather
    than an empty string.
    """
    name = name.strip()
    head = re.split(r'[\(\-\:_\s(:]', name)[0]
    return head or name


if __name__ == '__main__':
    # Smoke run: look up every sample title and log the chosen match.
    spide = doubanSpide()
    for bkname in testbooks:
        bkname = strip_subtitle(bkname)
        print(bkname)
        bkinfo = spide.parse_books(bkname)
        filter_bkinfo = spide.filter_spide_books(bkname, bkinfo)
        logger.debug('================ {} ================'.format(bkname))
        logger.debug(json.dumps(filter_bkinfo, indent=2, ensure_ascii=False))

19
tparseamazon.py Normal file
View File

@@ -0,0 +1,19 @@
import re
import json
# Sample amazon.cn "author row" snippets.  Each row is a sequence of
# <span dir="auto"> elements: an empty one first, then author names
# alternating with separator spans (", ", "、 ", "等等。").
s = [
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">丹·琼斯(Dan Jones)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">杰弗里·瓦夫罗(Geoffrey Wawro)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">克里斯托弗·希伯特(Christopher Hibbert)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">罗斯·金(Ross King)</span><span class="a-size-base" dir="auto">等等。</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[美]威廉·厄本(William Urban)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">陆大鹏</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">刘晓晖</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">堀田江理(Eri Hotta)</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">景跃进</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">张小劲</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">余逊达</span></div>''',
]

# Raw string: the pattern contains no invalid escape sequences.
_SPAN_TEXT = re.compile(r'<span[^>]*dir="auto">(.*?)</span>')


def parse_authors(row):
    """Return the author names found in one amazon author-row snippet.

    The text of every <span dir="auto"> is collected in order.  Index 0 is
    the leading empty span; after it names sit at odd positions and
    separators at even ones, so every second entry starting at 1 is a name.
    Returns an empty list when the row holds no such spans.
    """
    return _SPAN_TEXT.findall(row)[1::2]


for t in s:
    print(','.join(parse_authors(t)))

Binary file not shown.

2997
x

File diff suppressed because one or more lines are too long

118
xx Normal file
View File

@@ -0,0 +1,118 @@
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"丹·琼斯(Dan Jones)",
">",
", ",
">",
"杰弗里·瓦夫罗(Geoffrey Wawro)",
">",
", ",
">",
"克里斯托弗·希伯特(Christopher Hibbert)",
">",
", ",
">",
"罗斯·金(Ross King)",
">",
"等等。",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"马克·哈里斯",
">",
"、 ",
">",
"黎绮妮",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"马克·哈里斯",
">",
"、 ",
">",
"黎绮妮",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"[美]威廉·厄本(William Urban)",
">",
", ",
">",
"陆大鹏",
">",
"、 ",
">",
"刘晓晖",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"[英]安德鲁·罗伯茨(Andrew Roberts)",
">",
"、 ",
">",
"苏然",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"[英]安德鲁·罗伯茨(Andrew Roberts)",
">",
"、 ",
">",
"苏然",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"堀田江理(Eri Hotta)",
"></div>"
]
[
"<div class=\"a-row a-size-base a-color-secondary\">",
"",
">",
"景跃进",
">",
", ",
">",
"张小劲",
">",
"、 ",
">",
"余逊达",
"></div>"
]