kindle manager

This commit is contained in:
gavin
2020-06-14 21:33:42 +08:00
parent 9d5b787ff3
commit 7cb8e61519
36 changed files with 99 additions and 28 deletions

View File

@@ -22,17 +22,21 @@ logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)
spidetp = 1 # 0 - douban 1- amazon
ISDOUBAN = 1
IMGPATH = './downimg'
LINKPREF = 'https://book.douban.com/subject/' \
if ISDOUBAN else 'https://www.amazon.cn/s?k='
mheaders = {
'Host': 'www.douban.com',
'Referer': 'http://www.douban.com',
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
mparams = {}
murl = ""
if spidetp==0:
if ISDOUBAN==1:
mparams['Host']='www.douban.com',
mparams['search_text'] = 'bkname_xxx'
mparams['cat']='1001'
@@ -141,7 +145,7 @@ class bookInfoSpide():
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
if spidetp==0:
if ISDOUBAN==1:
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
@@ -164,6 +168,7 @@ class bookInfoSpide():
"""mbkn - bookname to be spided
return: {
"25853071": { # sid
"link":"https://....xxxxx"
"bookname": "庆余年",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
@@ -171,15 +176,32 @@ class bookInfoSpide():
},...}
"""
if spidetp==0: #douban
if ISDOUBAN==1: #douban
mparams['search_text'] = mbkn
else: #amazon
mparams['k'] = mbkn
r = requests.get( url=murl, headers=mheaders, params=mparams)
try:
s = requests.Session()
s.header = mheaders
s.params = mparams
r = s.get(murl)
#r = requests.get( url=murl, headers=mheaders, params=mparams)
except requests.exceptions.ConnectionError:
print('ConnectionError -- please wait 3 seconds')
time.sleep(3)
except requests.exceptions.ChunkedEncodingError:
print('ChunkedEncodingError -- please wait 3 seconds')
time.sleep(3)
except:
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
time.sleep(3)
if r.status_code != 200:
raise Exception("请求失败")
print('grab book {} info from webside failure'.format(mbkn))
bkinfo = defaultdict(dict)
sid = None
@@ -187,7 +209,7 @@ class bookInfoSpide():
resp = r.text
if spidetp==0:
if ISDOUBAN==1:
stat = 'SID'
for line in resp.split('\n'):
line = line.strip()
@@ -197,6 +219,7 @@ class bookInfoSpide():
ret=re.search(self.re_bn, line)
if ret:
sid = ret.group(1)
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
bkinfo[sid]['bookname'] = ret.group(2)
bkinfo[sid]['img'] = ret.group(3)
stat = 'RATE'
@@ -226,7 +249,8 @@ class bookInfoSpide():
if stat=='ASIN':
ret=re.search(self.re_asin, line)
if ret:
sid = ret.group(1)
sid=ret.group(1)
bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1))
stat = 'IMG'
continue
elif stat=='IMG':
@@ -261,25 +285,27 @@ class bookInfoSpide():
stat=='ASIN'
continue
return bkinfo
return [mbkn, bkinfo]
def filter_spide_books(self, mbkn, mbkinfo):
""" mbkn - bookname to be spide
def filter_spide_book(self, mbkinfo):
"""
mbkinfo:
douban
{
"25853071": { # sid
"bookname": "庆余年",
"庆余年": {
"link":"https://....25853071",
"bookname": "庆余年xxx",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
"author": "猫腻"
},...}
amazon
"B07RN73425": {
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
"孟子": {
"link": "https://....B07RN73425",
"bookname": "古典名著普及文库:孟子",
"author": "孙钦善",
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
"rate": "3.9"
"author": "孙钦善",
}
"""
@@ -289,18 +315,19 @@ class bookInfoSpide():
# f3/d3: mbkn and bookname different
[f1,f2,f3] = [0,0,0]
[d1,d2,d3] = [{},{},{}]
for k,v in mbkinfo.items():
mbkn = mbkinfo[0]
for k,v in mbkinfo[1].items():
bkn = v['bookname']
if len(v) == 4:
if len(v) == 5:
if (not f1) and (mbkn in bkn):
f1 = 1
d1 = {k:v}
d1 = {mbkn:v}
elif (not f1) and (not f2) and (bkn in mbkn):
f2 = 1
d2 = {k:v}
d2 = {mbkn:v}
elif (not f3):
f3 = 1
d3 = {k:v}
d3 = {mbkn:v}
else: continue
else:
continue
@@ -312,7 +339,28 @@ class bookInfoSpide():
elif f3:
return d3
return 0
return None
def down_book_img(self, mbkinfo):
    """Download the cover image of every book in ``mbkinfo`` into ``IMGPATH``.

    mbkinfo: dict whose values are per-book dicts; the cover URL is read
        from each value's ``'img'`` entry. Values with a missing or empty
        ``'img'`` entry are skipped.
    return: None. Each image is written to ``IMGPATH`` under the last
        ``/``-separated segment of its URL; responses whose status code
        is not 200 are ignored.

    Downloading is best-effort: an error while fetching or saving one
    image is printed and the remaining images are still attempted.
    """
    import os

    # NOTE(review): the header value itself begins with the text
    # "User-Agent:", which looks like a copy-paste slip; left unchanged here.
    headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    for info in mbkinfo.values():
        link = info.get('img')
        if not link:
            # No cover URL for this entry: nothing to download.
            continue

        # exist_ok makes this safe to repeat and free of the
        # check-then-create race; makedirs also handles nested paths.
        os.makedirs(IMGPATH, exist_ok=True)
        target = os.path.join(IMGPATH, link.split('/')[-1])

        try:
            # A timeout keeps one stalled server from hanging the whole run.
            img = requests.get(link, headers=headers, timeout=10)
            if img.status_code == 200:
                with open(target, 'wb') as fp:
                    fp.write(img.content)
        except Exception as e:
            # Deliberately broad: one failed cover must not stop the rest.
            print(e)
if __name__ == '__main__':
@@ -322,9 +370,10 @@ if __name__ == '__main__':
bkname = re.split(r'[\(\-\:_\s]',bkname.strip())[0]
print(bkname)
bkinfo = spide.grab_book_info(bkname)
filter_bkinfo = spide.filter_spide_books(bkname, bkinfo)
filter_bkinfo = spide.filter_spide_book(bkinfo)
if filter_bkinfo: spide.down_book_img(filter_bkinfo)
logger.debug('================ {} ================'.format(bkname))
#logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))