343 lines
13 KiB
Python
343 lines
13 KiB
Python
|
||
#########################################################
|
||
## @file : parseweb.py
|
||
## @desc : douban and amazon book spider
|
||
## amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
|
||
## douban SID 25742200 - https://book.douban.com/subject/25742200
|
||
## @create : 2020/6/14
|
||
## @author : Chengan
|
||
## @email : douboer@gmail.com
|
||
#########################################################
|
||
|
||
import requests
|
||
import json
|
||
import re
|
||
import os
|
||
import subprocess
|
||
import logging
|
||
from collections import defaultdict
|
||
|
||
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# getLogger() without a name returns the root logger, so everything logged at
# DEBUG level or above is appended to the file 'log' in the working directory.
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)

# ---------------------------------------------------------------------------
# Site selection
# ---------------------------------------------------------------------------
# 1 selects Douban; any other value selects Amazon.cn.
ISDOUBAN = 1
# Directory that downloaded cover images are written to.
IMGPATH = './downimg'
# Prefix that a Douban subject id / Amazon ASIN is appended to in order to
# build the link of a book.
LINKPREF = 'https://book.douban.com/subject/' \
    if ISDOUBAN else 'https://www.amazon.cn/s?k='

# ---------------------------------------------------------------------------
# HTTP request template (headers, query parameters, search URL)
# ---------------------------------------------------------------------------
# NOTE(review): 'Host' is www.douban.com while the Douban search URL below is
# on search.douban.com - confirm that the server accepts this combination.
mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'http://www.douban.com',
    # The header value must not repeat the header name ("User-Agent:").
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
# Alternative user agent:
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"

mparams = {}
murl = ""
if ISDOUBAN == 1:
    # A plain string; a trailing comma here would turn the value into a tuple.
    mparams['Host'] = 'www.douban.com'
    # 'bkname_xxx' / 'bookname_xxx' are placeholders for the search keyword.
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat'] = '1001'
    mparams['k'] = 'bookname_xxx'
    murl = "https://search.douban.com/book/subject_search"
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'http://www.amazon.cn'

    # Target query string:
    # https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
    mparams['Host'] = 'www.amazon.cn'
    mparams['k'] = 'bkname_xxx'
    mparams['i'] = 'stripbooks'
    mparams['__mk_zh_CN'] = '亚马逊网站'
    mparams['ref'] = 'nb_sb_noss'

    murl = 'https://www.amazon.cn/s'

# ---------------------------------------------------------------------------
# Sample book titles used by the command-line driver of this module
# ---------------------------------------------------------------------------
testbooks = [
    '24堂财富课',
    '甲骨文',
    '庆余年(精校版)',
    '商君书-中华经典名著全本全注全译丛书',
    '苏世民:我的经验与教训(2018读桥水达利欧的原则,2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)',
    '杨伯峻_论语译注',
    '小窗幽记',
    '少年凯歌',
    '投资要义',
    '白鱼解字',
    '历史的巨镜',
    '货币的教训',
    '钱从哪里来',
    '中国古代简史',
    '罗马人的故事(套装共15册)',
    '改变心理学的40项研究',
    '如何假装懂音乐',
    '管子(上下册)--中华经典名著全本全注全译(精)',
    '投资中最简单的事',
    '薛兆丰经济学讲义',
    '枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)',
    '中央帝国的哲学密码',
    '新编说文解字大全集(超值白金版)',
    '市场的逻辑(增订本)',
    '金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)',
    '从零开始学写作:个人增值的有效方法',
    '中国国家治理的制度逻辑:一个组织学研究',
    '中国为什么有前途:对外经济关系的战略潜能(第3版)',
    '日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))',
]
|
||
|
||
class bookInfoSpide():
    """Scrape book search results from Douban or Amazon.cn.

    The site is selected by the module-level ``ISDOUBAN`` flag, which is read
    when the class body runs (to compile the matching set of patterns) and
    again on every call.  The search result page is parsed line by line with
    a small state machine: each state waits for one field of the current
    book and hands over to the next state once that field has been found.
    """

    # Seconds to wait for an HTTP response before giving up.
    REQUEST_TIMEOUT = 10

    # User agent sent when downloading cover images.
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36')

    # Fields a record must carry before filter_spide_book() considers it.
    DOUBAN_FIELDS = frozenset((
        'link', 'bookname', 'img', 'score', 'ratenum',
        'author', 'publisher', 'publishing', 'description',
    ))
    AMAZON_FIELDS = frozenset(('link', 'bookname', 'img', 'author', 'rate'))

    # Every pattern exists as an attribute for both sites; the ones that do
    # not apply to the selected site stay None.
    re_bn = re_score = re_star = re_ratenum = None
    re_author = re_description = None
    re_asin = re_img = re_rate = re_end = re_author_split = None

    if ISDOUBAN==1:
        # group 1: subject id, group 2: title, group 3: cover image URL
        re_bn=re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_star=re.compile(r'''^<span class=\"allstar(\d+)\"></span>''')
        re_score=re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_ratenum=re.compile(r'''^<span>\((\d+)人评价\)</span>''')
        re_author=re.compile(r'''class=\"subject-cast\">(.+?)<''')
        re_description=re.compile(r'''^<p>(.+?)(<\/p>){0,1}$''')
    else:
        re_asin=re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
        re_img=re.compile(r'''^<img src=\"(.+?)\"$''')
        re_bn=re.compile(r'''^alt=\"(.+?)\"$''')
        re_author=re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
        re_rate=re.compile(r'''^<span aria-label=\"(.+?)\">$''')
        #re_end=re.compile(r'''<\/body><\/html>''')
        re_end=re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
        # Splits the author line into its markup / text pieces.
        re_author_split=re.compile(r'''<span.+?auto\">|<\/span''')

    def __init__(self):
        """Create a spider; it keeps no per-instance state."""
        pass

    def grab_book_info(self, mbkn: str):
        """Search the selected site for ``mbkn`` and parse the result page.

        mbkn - book name to search for

        return: a two-element list ``[mbkn, bkinfo]``.  ``bkinfo`` is a
        ``defaultdict(dict)`` keyed by Douban subject id (or Amazon ASIN):
            {
            "25853071": { # sid
              "link":"https://....xxxxx"
              "bookname": "庆余年",
              "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
              "score": "8.0",
              "ratenum": "1000",
              "author": "猫腻"
              "publisher": "中华书局"
              "description": "XXX"
              "publishing": "2015"
            },...}
        ``bkinfo`` is empty when the request fails or the server does not
        answer with status 200.
        """
        bkinfo = defaultdict(dict)

        # Work on a copy so that a search never leaves its keyword behind in
        # the module-level parameter template.
        params = dict(mparams)
        params['search_text' if ISDOUBAN==1 else 'k'] = mbkn

        try:
            with requests.Session() as session:
                r = session.get(url=murl, headers=mheaders, params=params,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Without a response there is nothing to parse.
            print(e)
            return [mbkn, bkinfo]

        if r.status_code != 200:
            print('grab book {} info from webside failure'.format(mbkn))
            return [mbkn, bkinfo]

        # The parsers only ever look at stripped, non-empty lines.
        stripped = (raw.strip() for raw in r.text.split('\n'))
        lines = [line for line in stripped if line]

        if ISDOUBAN==1:
            self._parse_douban(lines, bkinfo)
        else:
            self._parse_amazon(lines, bkinfo)

        return [mbkn, bkinfo]

    def _parse_douban(self, lines, bkinfo):
        """Fill ``bkinfo`` from the stripped lines of a Douban result page."""
        sid = None
        stat = 'SID'
        for line in lines:
            # A result header opens a new record in every state.  Checking it
            # first keeps a record with a missing field from swallowing the
            # book that follows it.
            ret = self.re_bn.search(line)
            if ret:
                sid = ret.group(1)
                bkinfo[sid]['link'] = LINKPREF + sid
                bkinfo[sid]['bookname'] = ret.group(2)
                bkinfo[sid]['img'] = ret.group(3)
                stat = 'STAR'
                continue

            if stat=='STAR':
                ret = self.re_star.search(line)
                if ret:
                    # "allstar00" marks a book without rating: there is no
                    # score / rating count to wait for.
                    stat = 'SCORE' if int(ret.group(1)) > 0 else 'AUTHOR'
            elif stat=='SCORE':
                ret = self.re_score.search(line)
                if ret:
                    bkinfo[sid]['score'] = ret.group(1)
                    stat = 'RATENUM'
            elif stat=='RATENUM':
                ret = self.re_ratenum.search(line)
                if ret:
                    bkinfo[sid]['ratenum'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat=='AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    cast = ret.group(1)
                    parts = cast.split(' / ')
                    if len(parts) >= 3:
                        # "author[ / author...] / publisher / year"
                        *authors, publisher, publishing = parts
                        bkinfo[sid]['publisher'] = publisher
                        bkinfo[sid]['publishing'] = publishing
                        bkinfo[sid]['author'] = '/'.join(authors)
                    else:
                        # Store the captured text, not the whole match with
                        # the surrounding markup.
                        bkinfo[sid]['author'] = cast
                    stat = 'DESCRIPTION'
            elif stat=='DESCRIPTION':
                ret = self.re_description.search(line)
                if ret:
                    bkinfo[sid]['description'] = ret.group(1).strip()
                    stat = 'SID'

    def _parse_amazon(self, lines, bkinfo):
        """Fill ``bkinfo`` from the stripped lines of an Amazon result page."""
        sid = None
        stat = 'ASIN'
        for line in lines:
            ret = None
            if stat=='ASIN':
                ret = self.re_asin.search(line)
                if ret:
                    sid = ret.group(1)
                    bkinfo[sid]['link'] = LINKPREF + sid
                    stat = 'IMG'
            elif stat=='IMG':
                ret = self.re_img.search(line)
                if ret:
                    bkinfo[sid]['img'] = ret.group(1)
                    stat = 'BOOKNAME'
            elif stat=='BOOKNAME':
                ret = self.re_bn.search(line)
                if ret:
                    # Keep the title up to the first parenthesis / whitespace.
                    bkname = re.split(r'[((\s]',ret.group(1).strip())[0]
                    bkinfo[sid]['bookname'] = bkname
                    stat = 'AUTHOR'
            elif stat=='AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    pieces = self.re_author_split.split(ret.group(0))
                    bkinfo[sid]['author'] = ','.join(pieces[3::4])
                    stat = 'RATE'
            elif stat=='RATE':
                ret = self.re_rate.search(line)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
                    # NOTE(review): the parser goes back to AUTHOR here and
                    # relies on the end marker below to start the next item -
                    # confirm against a current result page.
                    stat = 'AUTHOR'

            # A line that did not advance the state may be the end marker of
            # the current item: start looking for the next ASIN.
            if ret is None and self.re_end.search(line):
                stat = 'ASIN'

    def filter_spide_book(self, mbkinfo):
        """Pick the record that best matches the searched book name.

        mbkinfo: the ``[mbkn, bkinfo]`` list returned by grab_book_info().
        Records of ``bkinfo`` look like
            douban
            "10530219": {
              "link": "https://book.douban.com/subject/10530219",
              "bookname": "市场的逻辑",
              "img": "https://img3.doubanio.com/view/subject/s/public/s8912552.jpg",
              "score": "8.3",
              "ratenum": "218",
              "publisher": "世纪文景 上海人民出版社",
              "publishing": "2012",
              "author": "张维迎"
            },...}
            amazon
            "B07RN73425": {
              "link": "https://....B07RN73425",
              "bookname": "古典名著普及文库:孟子",
              "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
              "rate": "3.9"
              "author": "孙钦善",
            }

        Only records carrying every field of the selected site are
        considered.  In order of preference the result is
          1. the first record whose book name contains ``mbkn``,
          2. the first record whose book name is contained in ``mbkn``,
          3. the first complete record.

        return: ``{mbkn: record}``, or None when no record is complete.
        """
        mbkn = mbkinfo[0]
        required = self.DOUBAN_FIELDS if ISDOUBAN==1 else self.AMAZON_FIELDS
        # Completeness is checked before 'bookname' is read, so a partially
        # parsed record can never raise KeyError.
        complete = [v for v in mbkinfo[1].values() if required.issubset(v)]
        if not complete:
            return None

        for v in complete:
            if mbkn in v['bookname']:
                return {mbkn: v}
        for v in complete:
            if v['bookname'] in mbkn:
                return {mbkn: v}
        return {mbkn: complete[0]}

    def down_book_img(self, mbkinfo):
        """Download the cover image of every record into ``IMGPATH``.

        mbkinfo: mapping whose values are record dicts with an 'img' URL,
        e.g. the result of filter_spide_book().  The file name is the last
        path component of the URL.  Records without an image URL, failed
        requests and non-200 answers are skipped; errors are printed.
        """
        if not mbkinfo:
            return

        os.makedirs(IMGPATH, exist_ok=True)
        headers = {'User-Agent': self.USER_AGENT}

        for v in mbkinfo.values():
            link = v.get('img')
            if not link:
                continue
            p = os.path.join(IMGPATH, link.split('/')[-1])

            try:
                img = requests.get(link, headers=headers,
                                   timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(e)
                continue
            if img.status_code != 200:
                continue

            try:
                with open(p, 'wb') as fp:
                    fp.write(img.content)
            except OSError as e:
                print(e)
|
||
|
||
if __name__ == '__main__':
    # Command-line driver: search every sample title, keep the best matching
    # record, download its cover image and write the results to the log.

    # A raw title is cut at the first parenthesis, dash, colon, underscore or
    # whitespace; only the part in front of it is used as search keyword.
    title_cut = re.compile(r'[\((\-\::_\s]')
    spider = bookInfoSpide()

    for raw_title in testbooks:
        keyword = title_cut.split(raw_title.strip())[0]
        print(keyword)

        found = spider.grab_book_info(keyword)
        best = spider.filter_spide_book(found)
        if best:
            spider.down_book_img(best)

        logger.debug('================ {} ================'.format(keyword))
        # First the raw search result, then the record that was selected.
        for payload in (found, best):
            logger.debug(json.dumps(payload,indent=2, ensure_ascii=False))
|
||
|