kindle manager
This commit is contained in:
99
parseweb.py
99
parseweb.py
@@ -22,17 +22,21 @@ logger = logging.getLogger()
|
||||
logger.addHandler(logging.FileHandler('log'))
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
spidetp = 1 # 0 - douban 1- amazon
|
||||
ISDOUBAN = 1
|
||||
IMGPATH = './downimg'
|
||||
LINKPREF = 'https://book.douban.com/subject/' \
|
||||
if ISDOUBAN else 'https://www.amazon.cn/s?k='
|
||||
|
||||
mheaders = {
|
||||
'Host': 'www.douban.com',
|
||||
'Referer': 'http://www.douban.com',
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
||||
'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
|
||||
}
|
||||
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
||||
|
||||
mparams = {}
|
||||
murl = ""
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
mparams['Host']='www.douban.com',
|
||||
mparams['search_text'] = 'bkname_xxx'
|
||||
mparams['cat']='1001'
|
||||
@@ -141,7 +145,7 @@ class bookInfoSpide():
|
||||
|
||||
|
||||
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
||||
@@ -164,6 +168,7 @@ class bookInfoSpide():
|
||||
"""mbkn - bookname to be spided
|
||||
return: {
|
||||
"25853071": { # sid
|
||||
"link":"https://....xxxxx"
|
||||
"bookname": "庆余年",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
@@ -171,15 +176,32 @@ class bookInfoSpide():
|
||||
},...}
|
||||
"""
|
||||
|
||||
if spidetp==0: #douban
|
||||
if ISDOUBAN==1: #douban
|
||||
mparams['search_text'] = mbkn
|
||||
else: #amazon
|
||||
mparams['k'] = mbkn
|
||||
|
||||
r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||
try:
|
||||
s = requests.Session()
|
||||
s.header = mheaders
|
||||
s.params = mparams
|
||||
r = s.get(murl)
|
||||
#r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
print('ConnectionError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
except requests.exceptions.ChunkedEncodingError:
|
||||
print('ChunkedEncodingError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
except:
|
||||
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
if r.status_code != 200:
|
||||
raise Exception("请求失败")
|
||||
print('grab book {} info from webside failure'.format(mbkn))
|
||||
|
||||
bkinfo = defaultdict(dict)
|
||||
sid = None
|
||||
@@ -187,7 +209,7 @@ class bookInfoSpide():
|
||||
|
||||
resp = r.text
|
||||
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
stat = 'SID'
|
||||
for line in resp.split('\n'):
|
||||
line = line.strip()
|
||||
@@ -197,6 +219,7 @@ class bookInfoSpide():
|
||||
ret=re.search(self.re_bn, line)
|
||||
if ret:
|
||||
sid = ret.group(1)
|
||||
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
|
||||
bkinfo[sid]['bookname'] = ret.group(2)
|
||||
bkinfo[sid]['img'] = ret.group(3)
|
||||
stat = 'RATE'
|
||||
@@ -226,7 +249,8 @@ class bookInfoSpide():
|
||||
if stat=='ASIN':
|
||||
ret=re.search(self.re_asin, line)
|
||||
if ret:
|
||||
sid = ret.group(1)
|
||||
sid=ret.group(1)
|
||||
bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1))
|
||||
stat = 'IMG'
|
||||
continue
|
||||
elif stat=='IMG':
|
||||
@@ -261,25 +285,27 @@ class bookInfoSpide():
|
||||
stat=='ASIN'
|
||||
continue
|
||||
|
||||
return bkinfo
|
||||
return [mbkn, bkinfo]
|
||||
|
||||
def filter_spide_books(self, mbkn, mbkinfo):
|
||||
""" mbkn - bookname to be spide
|
||||
def filter_spide_book(self, mbkinfo):
|
||||
"""
|
||||
mbkinfo:
|
||||
douban
|
||||
{
|
||||
"25853071": { # sid
|
||||
"bookname": "庆余年",
|
||||
"庆余年": {
|
||||
"link":"https://....25853071",
|
||||
"bookname": "庆余年xxx",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
"author": "猫腻"
|
||||
},...}
|
||||
amazon
|
||||
"B07RN73425": {
|
||||
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||
"孟子": {
|
||||
"link": "https://....B07RN73425",
|
||||
"bookname": "古典名著普及文库:孟子",
|
||||
"author": "孙钦善",
|
||||
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||
"rate": "3.9"
|
||||
"author": "孙钦善",
|
||||
}
|
||||
|
||||
"""
|
||||
@@ -289,18 +315,19 @@ class bookInfoSpide():
|
||||
# f3/d3: mbkn and bookname different
|
||||
[f1,f2,f3] = [0,0,0]
|
||||
[d1,d2,d3] = [{},{},{}]
|
||||
for k,v in mbkinfo.items():
|
||||
mbkn = mbkinfo[0]
|
||||
for k,v in mbkinfo[1].items():
|
||||
bkn = v['bookname']
|
||||
if len(v) == 4:
|
||||
if len(v) == 5:
|
||||
if (not f1) and (mbkn in bkn):
|
||||
f1 = 1
|
||||
d1 = {k:v}
|
||||
d1 = {mbkn:v}
|
||||
elif (not f1) and (not f2) and (bkn in mbkn):
|
||||
f2 = 1
|
||||
d2 = {k:v}
|
||||
d2 = {mbkn:v}
|
||||
elif (not f3):
|
||||
f3 = 1
|
||||
d3 = {k:v}
|
||||
d3 = {mbkn:v}
|
||||
else: continue
|
||||
else:
|
||||
continue
|
||||
@@ -312,7 +339,28 @@ class bookInfoSpide():
|
||||
elif f3:
|
||||
return d3
|
||||
|
||||
return 0
|
||||
return None
|
||||
|
||||
def down_book_img(self, mbkinfo):
    """Download the cover image of every book in *mbkinfo* into IMGPATH.

    mbkinfo maps a key to a per-book info dict; only the ``'img'`` entry
    (the cover URL) of each info dict is read.  Each image is stored under
    IMGPATH using the last path segment of its URL as the file name.

    Downloading is best-effort: an entry without an image URL is skipped,
    a non-200 response writes nothing, and a network or file error is
    printed and does not stop the remaining downloads.  ``None`` or an
    empty mapping is a no-op.  Returns None.
    """
    import os

    # The header value must be the bare agent string; it must not repeat
    # the "User-Agent:" header name inside the value.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/56.0.2924.87 Safari/537.36',
    }

    for info in (mbkinfo or {}).values():
        link = info.get('img')
        if not link:
            # No cover URL recorded for this book; nothing to fetch.
            continue

        # Created lazily so the disk is untouched when nothing is fetched;
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(IMGPATH, exist_ok=True)
        target = os.path.join(IMGPATH, link.split('/')[-1])

        try:
            # A timeout keeps one stalled server from hanging the whole run.
            img = requests.get(link, headers=headers, timeout=10)
            if img.status_code == 200:
                with open(target, 'wb') as fp:
                    fp.write(img.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(e)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -322,9 +370,10 @@ if __name__ == '__main__':
|
||||
bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0]
|
||||
print(bkname)
|
||||
bkinfo = spide.grab_book_info(bkname)
|
||||
filter_bkinfo = spide.filter_spide_books(bkname, bkinfo)
|
||||
filter_bkinfo = spide.filter_spide_book(bkinfo)
|
||||
if filter_bkinfo: spide.down_book_img(filter_bkinfo)
|
||||
|
||||
logger.debug('================ {} ================'.format(bkname))
|
||||
#logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
||||
logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
||||
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user