Files
kman/parseweb.py
2020-06-30 08:31:21 +08:00

347 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#########################################################
## @file : parseweb.py
## @desc : douban and amazon book spider
## amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
## douban SID 25742200 - https://book.douban.com/subject/25742200
## @create : 2020/6/14
## @author : Chengan
## @email : douboer@gmail.com
#########################################################
import requests
import json
import re
import os
import subprocess
import logging
from collections import defaultdict
# log info
# NOTE: this configures the ROOT logger, so DEBUG output from every library
# that logs (not only this module) is appended to the file 'log' in the
# current working directory.
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)

# Site switch: 1 -> douban, anything falsy -> amazon.cn.
ISDOUBAN = 1
# Directory that downloaded cover images are written to.
IMGPATH = './downimg'
# Prefix that a douban sid / amazon ASIN is appended to to form the book link.
LINKPREF = 'https://book.douban.com/subject/' \
    if ISDOUBAN else 'https://www.amazon.cn/s?k='

# HTTP headers sent with the search request.
# NOTE(review): Host is www.douban.com while the douban search URL below is
# on search.douban.com - confirm the server accepts the mismatch.
mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'http://www.douban.com',
    # The value must not repeat the header name ("User-Agent:...").
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
# Alternative agent string:
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"

# Query-string parameters and endpoint of the search request.  The
# '..._xxx' values are placeholders; the spider substitutes the real book
# name per request.
mparams = {}
murl = ""
if ISDOUBAN == 1:
    # No trailing comma here: it would turn the value into a 1-tuple.
    mparams['Host'] = 'www.douban.com'
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat'] = '1001'
    mparams['k'] = 'bookname_xxx'
    murl = "https://search.douban.com/book/subject_search"
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'http://www.amazon.cn'
    #https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
    mparams['Host'] = 'www.amazon.cn'
    mparams['k'] = 'bkname_xxx'
    mparams['i'] = 'stripbooks'
    # Keys follow the reference URL above: no '=' inside the key, and
    # 'ref' rather than 'reg'.
    mparams['__mk_zh_CN'] = '亚马逊网站'
    mparams['ref'] = 'nb_sb_noss'
    murl = 'https://www.amazon.cn/s'
# Sample book titles used as search input; the disabled driver under the
# __main__ guard iterates over this list.
testbooks = [
    '24堂财富课',
    '甲骨文',
    '庆余年(精校版)',
    '商君书-中华经典名著全本全注全译丛书',
    '苏世民:我的经验与教训2018读桥水达利欧的原则2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)',
    '杨伯峻_论语译注',
    '小窗幽记',
    '少年凯歌',
    '投资要义',
    '白鱼解字',
    '历史的巨镜',
    '货币的教训',
    '钱从哪里来',
    '中国古代简史',
    '罗马人的故事(套装共15册)',
    '改变心理学的40项研究',
    '如何假装懂音乐',
    '管子(上下册)--中华经典名著全本全注全译(精)',
    '投资中最简单的事',
    '薛兆丰经济学讲义',
    '枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)',
    '中央帝国的哲学密码',
    '新编说文解字大全集(超值白金版)',
    '市场的逻辑(增订本)',
    '金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)',
    '从零开始学写作:个人增值的有效方法',
    '中国国家治理的制度逻辑:一个组织学研究',
    '中国为什么有前途对外经济关系的战略潜能第3版',
    '日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))',
]
class bookInfoSpide():
    """Scrape book search results from douban or amazon.cn.

    Which site is used is decided once, at import time, by the module-level
    ``ISDOUBAN`` switch; it also selects the regular expressions compiled
    below.  Parsing is line based: the response is split on newlines and a
    small state machine walks the lines, looking for one field at a time.
    """

    # Seconds to wait on the network before giving up on a request.
    REQUEST_TIMEOUT = 30

    # A record is treated as complete only when it carries exactly this many
    # fields: link, bookname, img, score, ratenum, author, publisher,
    # publishing and description, i.e. everything the douban parser can set.
    COMPLETE_FIELD_COUNT = 9

    # Every pattern name exists on the class whichever site is selected;
    # the ones the selected site does not use stay None.
    re_bn = re_score = re_star = re_ratenum = re_author = None
    re_description = re_asin = re_img = re_rate = re_end = None

    if ISDOUBAN==1:
        # groups: sid, title, cover image url
        re_bn=re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_star=re.compile(r'''^<span class=\"allstar(\d+)\"></span>''')
        re_score=re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_ratenum=re.compile(r'''^<span>\((\d+)人评价\)</span>''')
        re_author=re.compile(r'''class=\"subject-cast\">(.+?)<''')
        re_description=re.compile(r'''^<p>(.+?)(<\/p>){0,1}$''')
    else:
        re_asin=re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
        re_img=re.compile(r'''^<img src=\"(.+?)\"$''')
        re_bn=re.compile(r'''^alt=\"(.+?)\"$''')
        re_author=re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
        re_rate=re.compile(r'''^<span aria-label=\"(.+?)\">$''')
        #re_end=re.compile(r'''<\/body><\/html>''')
        re_end=re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')

    def __init__(self):
        pass

    def grab_book_info(self, mbkn: str):
        """Search the configured site for a book and parse the result page.

        mbkn - book name to search for

        return: a two-item list ``[mbkn, bkinfo]`` where ``bkinfo`` maps a
        douban sid / amazon ASIN to the fields found for that result, e.g.
        {
          "25853071": { # sid
            "link":"https://....xxxxx"
            "bookname": "庆余年",
            "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
            "score": "8.0",
            "ratenum": "1000",
            "author": "猫腻"
            "publisher": "中华书局"
            "description": "XXX"
            "publishing": "2015"
          },...}

        ``bkinfo`` is empty when the request fails or the server does not
        answer with status 200; the failure is printed, not raised.
        """
        bkinfo = defaultdict(dict)

        # Work on a copy so that one lookup does not leave its book name
        # behind in the shared module-level parameter dict.
        params = dict(mparams)
        params['search_text' if ISDOUBAN==1 else 'k'] = mbkn

        try:
            with requests.Session() as session:
                r = session.get(url=murl, headers=mheaders, params=params,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return [mbkn, bkinfo]

        if r.status_code != 200:
            print('grab book {} info from webside failure'.format(mbkn))
            return [mbkn, bkinfo]

        # Both parsers work on stripped, non-empty lines.
        lines = [ln for ln in (raw.strip() for raw in r.text.split('\n')) if ln]
        if ISDOUBAN==1:
            self._parse_douban(lines, bkinfo)
        else:
            self._parse_amazon(lines, bkinfo)
        return [mbkn, bkinfo]

    def _parse_douban(self, lines, bkinfo):
        """Fill ``bkinfo`` from the lines of a douban search result page.

        States advance SID -> STAR -> (SCORE -> RATENUM ->) AUTHOR ->
        DESCRIPTION -> SID; a line is only tested against the pattern of the
        current state.
        NOTE(review): a result without a matching description line keeps the
        parser in DESCRIPTION, so the following result's header line is not
        recognised - confirm against real pages.
        """
        sid = None
        stat = 'SID'
        for line in lines:
            if stat=='SID':
                ret = self.re_bn.search(line)
                if ret:
                    sid = ret.group(1)
                    # LINKPREF is a URL prefix, so plain concatenation is
                    # used rather than a filesystem path join.
                    bkinfo[sid]['link'] = LINKPREF + sid
                    bkinfo[sid]['bookname'] = ret.group(2)
                    bkinfo[sid]['img'] = ret.group(3)
                    stat = 'STAR'
            elif stat=='STAR':
                ret = self.re_star.search(line)
                if ret:
                    star = ret.group(1)
                    if star=='00':
                        # unrated: skip the score / rating-count lines
                        stat = 'AUTHOR'
                    elif int(star) > 0:
                        stat = 'SCORE'
            elif stat=='SCORE':
                ret = self.re_score.search(line)
                if ret:
                    bkinfo[sid]['score'] = ret.group(1)
                    stat = 'RATENUM'
            elif stat=='RATENUM':
                ret = self.re_ratenum.search(line)
                if ret:
                    bkinfo[sid]['ratenum'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat=='AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    tt = ret.group(1).split(' / ')
                    if len(tt)>=3:
                        # "author [/ author ...] / publisher / year"
                        *author, publisher, publishing = tt
                        bkinfo[sid]['publisher'] = publisher
                        bkinfo[sid]['publishing'] = publishing
                        bkinfo[sid]['author'] = '/'.join(author)
                    else:
                        # Store the captured text, not the whole match
                        # (which would include the surrounding markup).
                        bkinfo[sid]['author'] = ret.group(1)
                    stat = 'DESCRIPTION'
            elif stat=='DESCRIPTION':
                ret = self.re_description.search(line)
                if ret:
                    bkinfo[sid]['description'] = ret.group(1).strip()
                    stat = 'SID'

    def _parse_amazon(self, lines, bkinfo):
        """Fill ``bkinfo`` from the lines of an amazon.cn search result page.

        States advance ASIN -> IMG -> BOOKNAME -> AUTHOR -> RATE; a line that
        does not match the current state's pattern is checked against the
        end-of-result marker, which sends the parser back to ASIN.
        """
        sid = None
        stat = 'ASIN'
        for line in lines:
            if stat=='ASIN':
                ret = self.re_asin.search(line)
                if ret:
                    sid = ret.group(1)
                    # LINKPREF ends in "?k=", so the ASIN must be appended
                    # directly; a path join would insert a "/" before it.
                    bkinfo[sid]['link'] = LINKPREF + sid
                    stat = 'IMG'
                    continue
            elif stat=='IMG':
                ret = self.re_img.search(line)
                if ret:
                    bkinfo[sid]['img'] = ret.group(1)
                    stat = 'BOOKNAME'
                    continue
            elif stat=='BOOKNAME':
                ret = self.re_bn.search(line)
                if ret:
                    # keep only the text before the first "(" or whitespace
                    bkname = re.split(r'[(\s]',ret.group(1).strip())[0]
                    bkinfo[sid]['bookname'] = bkname
                    stat = 'AUTHOR'
                    continue
            elif stat=='AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    author = ','.join(re.split(r'''<span.+?auto\">|<\/span''', ret.group(0))[3::4])
                    bkinfo[sid]['author'] = author
                    stat = 'RATE'
                    continue
            elif stat=='RATE':
                ret = self.re_rate.search(line)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
                    # NOTE(review): the state goes back to AUTHOR (not ASIN)
                    # after a rating; kept as found - confirm this is meant.
                    stat = 'AUTHOR'
                    continue
            else:
                continue
            # End of one result: look for the next ASIN.  This has to be an
            # assignment; a comparison here would leave the parser stuck on
            # the first result.
            if self.re_end.search(line):
                stat = 'ASIN'

    def filter_spide_book(self, mbkinfo):
        """Pick the single best record out of a grab_book_info() result.

        mbkinfo - the ``[mbkn, bkinfo]`` list returned by grab_book_info(),
        where bkinfo looks like
        douban
            "10530219": {
                "link": "https://book.douban.com/subject/10530219",
                "bookname": "市场的逻辑",
                "img": "https://img3.doubanio.com/view/subject/s/public/s8912552.jpg",
                "score": "8.3",
                "ratenum": "218",
                "publisher": "世纪文景 上海人民出版社",
                "publishing": "2012",
                "author": "张维迎"
            },...}
        amazon
            "孟子": {
                "link": "https://....B07RN73425",
                "bookname": "古典名著普及文库:孟子",
                "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
                "rate": "3.9"
                "author": "孙钦善",
            }

        Only records with exactly COMPLETE_FIELD_COUNT fields and a
        'bookname' are considered.  Among those, in iteration order:
          1. the first whose bookname contains mbkn, otherwise
          2. the first whose bookname is contained in mbkn, otherwise
          3. the first remaining record.
        NOTE(review): the amazon parser sets at most five fields per record,
        so amazon results never pass the completeness check - confirm this
        is intended.

        return: ``{mbkn: record}`` for the chosen record, or None when no
        record qualifies.
        """
        mbkn = mbkinfo[0]
        contained = None   # tier 2 candidate
        fallback = None    # tier 3 candidate
        for v in mbkinfo[1].values():
            if len(v) != self.COMPLETE_FIELD_COUNT:
                continue
            bkn = v.get('bookname')
            if bkn is None:
                # A record without a name cannot be compared; skip it
                # instead of failing the whole selection.
                continue
            if mbkn in bkn:
                # Tier 1 always wins, so the first hit can be returned.
                return {mbkn: v}
            if bkn in mbkn:
                if contained is None:
                    contained = v
            elif fallback is None:
                fallback = v

        best = contained if contained is not None else fallback
        if best is None:
            return None
        return {mbkn: best}

    def down_book_img(self, mbkinfo):
        """Download the cover image of every record in ``mbkinfo``.

        mbkinfo - mapping whose values are record dicts with an 'img' url,
        e.g. the return value of filter_spide_book().

        Each image is saved under IMGPATH using the last path segment of its
        url as the file name.  Records without an 'img' entry are skipped;
        network and file errors are printed and do not stop the remaining
        downloads.
        """
        if not mbkinfo:
            return

        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

        os.makedirs(IMGPATH, exist_ok=True)
        for v in mbkinfo.values():
            link = v.get('img')
            if not link:
                continue
            p = os.path.join(IMGPATH, link.split('/')[-1])
            try:
                img = requests.get(link, headers=headers,
                                   timeout=self.REQUEST_TIMEOUT)
                if img.status_code == 200:
                    with open(p, 'wb') as fp:
                        fp.write(img.content)
            except (requests.RequestException, OSError) as e:
                print(e)
if __name__ == '__main__':
    # XXX move to unitest.kman.py
    #
    # Manual driver, currently disabled: for every sample title it searches
    # the configured site, keeps the best record, downloads its cover and
    # writes both the raw and the filtered result to the log.
    #
    # spide = bookInfoSpide()
    # for bkname in testbooks:
    #     bkname = re.split(r'[\(\-\:_\s]',bkname.strip())[0]
    #     print(bkname)
    #     bkinfo = spide.grab_book_info(bkname)
    #     filter_bkinfo = spide.filter_spide_book(bkinfo)
    #     if filter_bkinfo: spide.down_book_img(filter_bkinfo)
    #     logger.debug('================ {} ================'.format(bkname))
    #     logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
    #     logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))
    pass