Files
kman/parseweb.py
2020-06-15 21:28:17 +08:00

376 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#########################################################
## @file : parseweb.py
## @desc : douban and amazon book spider
## amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
## douban SID 25742200 - https://book.douban.com/subject/25742200
## @create : 2020/6/14
## @author : Chengan
## @email : douboer@gmail.com
#########################################################
import json
import logging
import os
import re
import subprocess
import time
from collections import defaultdict

import requests
# log info
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.DEBUG)
ISDOUBAN = 1
IMGPATH = './downimg'
LINKPREF = 'https://book.douban.com/subject/' \
if ISDOUBAN else 'https://www.amazon.cn/s?k='
mheaders = {
'Host': 'www.douban.com',
'Referer': 'http://www.douban.com',
'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
mparams = {}
murl = ""
if ISDOUBAN==1:
mparams['Host']='www.douban.com',
mparams['search_text'] = 'bkname_xxx'
mparams['cat']='1001'
mparams['k']='bookname_xxx'
murl = "https://search.douban.com/book/subject_search"
else:
mheaders['Host']='www.amazon.cn'
mheaders['Referer']='http:/www.amazon.cn'
#https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
mparams['Host']='www.amazon.cn'
mparams['k']='bkname_xxx'
mparams['i']='stripbooks'
mparams['__mk_zh_CN=']='亚马逊网站'
mparams['reg']='nb_sb_noss'
murl = 'https://www.amazon.cn/s'
testbooks =['24堂财富课',
'甲骨文',
'庆余年(精校版)',
'商君书-中华经典名著全本全注全译丛书',
'苏世民:我的经验与教训2018读桥水达利欧的原则2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)',
'杨伯峻_论语译注',
'小窗幽记',
'少年凯歌',
'投资要义',
'白鱼解字',
'历史的巨镜',
'货币的教训',
'钱从哪里来',
'中国古代简史',
'罗马人的故事(套装共15册)',
'改变心理学的40项研究',
'如何假装懂音乐',
'管子(上下册)--中华经典名著全本全注全译(精)',
'投资中最简单的事',
'薛兆丰经济学讲义',
'枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)',
'中央帝国的哲学密码',
'新编说文解字大全集(超值白金版)',
'市场的逻辑(增订本)',
'金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)',
'从零开始学写作:个人增值的有效方法',
'中国国家治理的制度逻辑:一个组织学研究',
'中国为什么有前途对外经济关系的战略潜能第3版',
'日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))']
testresp = """<div class="result">
<div class="pic">
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
</div>
<div class="content">
<div class="title">
<h3>
<span>[书籍]</span>&nbsp;<a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
</h3>
<div class="rating-info">
<span class="allstar45"></span>
<span class="rating_nums">9.4</span>
<span>(145人评价)</span>
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
</div>
</div>
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
</div>
</div>
div class="result">
<div class="pic">
<a class="nbg" href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" title="解读中国经济" ><img src="https://img3.doubanio.com/view/subject/s/public/s29872890.jpg"></a>
</div>
<div class="content">
<div class="title">
<h3>
<span>[书籍]</span>&nbsp;<a href="https://www.douban.com/link2/?url=https%3A%2F%2Fbook.douban.com%2Fsubject%2F30329810%2F&amp;query=%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E&amp;cat_id=1001&amp;type=search&amp;pos=1" target="_blank" onclick="moreurl(this,{i: '1', query: '%E8%A7%A3%E8%AF%BB%E4%B8%AD%E5%9B%BD%E7%BB%8F%E6%B5%8E', from: 'dou_search_book', sid: 30329810, qcat: ''})" >解读中国经济 </a>
</h3>
<div class="rating-info">
<span class="allstar45"></span>
<span class="rating_nums">9.4</span>
<span>(145人评价)</span>
<span class="subject-cast">林毅夫 / 北京大学出版社 / 2018</span>
</div>
</div>
<p>《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...</p>
</div>
</div>"""
class bookInfoSpide():
"""
re_bn = re.compile(r'''
class=\"nbg.+?sid: (\d+?),.+?
title=\"(.+?)\".+?
img src=\"(.+?)\".+?
rating_nums\">(.+?)<
''',flags=re.S|re.X)
"""
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
if ISDOUBAN==1:
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
re_norate = re.compile(r'''class=\"allstar00\"''')
re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
else:
re_asin = re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
re_img = re.compile(r'''^<img src=\"(.+?)\"$''')
re_bn = re.compile(r'''^alt=\"(.+?)\"$''')
re_author = re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
re_rate = re.compile(r'''^<span aria-label=\"(.+?)\">$''')
#re_end = re.compile(r'''<\/body><\/html>''')
re_end = re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')
def __init__(self):
pass
def grab_book_info(self, mbkn: str):
"""mbkn - bookname to be spided
return: {
"25853071": { # sid
"link":"https://....xxxxx"
"bookname": "庆余年",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
"author": "猫腻"
},...}
"""
if ISDOUBAN==1: #douban
mparams['search_text'] = mbkn
else: #amazon
mparams['k'] = mbkn
try:
session = requests.Session()
session.header = mheaders
session.params = mparams
r = session.get( url=murl, headers=mheaders, params=mparams)
#r = requests.get( url=murl, headers=mheaders, params=mparams)
except requests.exceptions.ConnectionError:
print('ConnectionError -- please wait 3 seconds')
time.sleep(3)
except requests.exceptions.ChunkedEncodingError:
print('ChunkedEncodingError -- please wait 3 seconds')
time.sleep(3)
except:
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
time.sleep(3)
if r.status_code != 200:
print('grab book {} info from webside failure'.format(mbkn))
bkinfo = defaultdict(dict)
sid = None
stat = None
resp = r.text
if ISDOUBAN==1:
stat = 'SID'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
if stat=='SID':
ret=re.search(self.re_bn, line)
if ret:
sid = ret.group(1)
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
bkinfo[sid]['bookname'] = ret.group(2)
bkinfo[sid]['img'] = ret.group(3)
stat = 'RATE'
continue
elif stat=='RATE':
# if no rate, goto next bookname state
if re.search(self.re_norate, line):
stat = 'SID'
continue
ret=re.search(self.re_rate, line)
if ret:
bkinfo[sid]['rate'] = ret.group(1)
stat = 'AUTHOR'
continue
elif stat=='AUTHOR':
ret=re.search(self.re_author, line)
if ret:
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
stat = 'SID'
else: continue
else:
stat='ASIN'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
if stat=='ASIN':
ret=re.search(self.re_asin, line)
if ret:
sid=ret.group(1)
bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1))
stat = 'IMG'
continue
elif stat=='IMG':
ret=re.search(self.re_img, line)
if ret:
bkinfo[sid]['img'] = ret.group(1)
stat = 'BOOKNAME'
continue
elif stat=='BOOKNAME':
ret=re.search(self.re_bn, line)
if ret:
bkname = re.split(r'[(\s]',ret.group(1).strip())[0]
bkinfo[sid]['bookname'] = bkname
stat = 'AUTHOR'
continue
elif stat=='AUTHOR':
ret=re.search(self.re_author, line)
if ret:
author = ','.join(re.split('<span.+?auto\">|<\/span', ret.group(0))[3::4])
bkinfo[sid]['author'] = author
stat = 'RATE'
continue
elif stat=='RATE':
ret=re.search(self.re_rate, line)
if ret:
bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
stat = 'AUTHOR'
continue
else: continue
if re.search(self.re_end, line):
stat=='ASIN'
continue
return [mbkn, bkinfo]
def filter_spide_book(self, mbkinfo):
"""
mbkinfo:
douban
{
"庆余年": {
"link":"https://....25853071",
"bookname": "庆余年xxx",
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
"rate": "8.0",
"author": "猫腻"
},...}
amazon
"孟子": {
"link": "https://....B07RN73425",
"bookname": "古典名著普及文库:孟子",
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
"rate": "3.9"
"author": "孙钦善",
}
"""
#booklink - https://book.douban.com/subject/{sid}
# f1/d1: mbkn include in bookname
# f2/d2: bookname include mbkn
# f3/d3: mbkn and bookname different
[f1,f2,f3] = [0,0,0]
[d1,d2,d3] = [{},{},{}]
mbkn = mbkinfo[0]
for k,v in mbkinfo[1].items():
bkn = v['bookname']
if len(v) == 5:
if (not f1) and (mbkn in bkn):
f1 = 1
d1 = {mbkn:v}
elif (not f1) and (not f2) and (bkn in mbkn):
f2 = 1
d2 = {mbkn:v}
elif (not f3):
f3 = 1
d3 = {mbkn:v}
else: continue
else:
continue
if f1:
return d1
elif f2:
return d2
elif f3:
return d3
return None
def down_book_img(self, mbkinfo):
import os
import socket
from urllib.request import urlretrieve
headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
for k,v in mbkinfo.items():
link = v['img']
if not os.path.exists(IMGPATH): os.mkdir(IMGPATH)
p=os.path.join(IMGPATH,link.split('/')[-1])
try:
img = requests.get(link, headers=headers)
if img.status_code == 200:
with open(p, 'wb') as fp:
fp.write(img.content)
except Exception as e:
print(e)
if __name__ == '__main__':
spide = bookInfoSpide()
for bkname in testbooks:
bkname = re.split(r'[\(\-\:_\s]',bkname.strip())[0]
print(bkname)
bkinfo = spide.grab_book_info(bkname)
filter_bkinfo = spide.filter_spide_book(bkinfo)
if filter_bkinfo: spide.down_book_img(filter_bkinfo)
logger.debug('================ {} ================'.format(bkname))
logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))