#########################################################
## @file : tbook.py
## @desc : douban / amazon book spider
## @create : 2020/6/13
## @author : Chengan
## @email : douboer@gmail.com
#########################################################
import requests
import json
import re
import os
import subprocess
import logging
from collections import defaultdict
# Log setup: everything at DEBUG and above goes to the file ./log through
# the root logger.  The file is opened as UTF-8 explicitly because the
# records written by this script contain Chinese book titles and
# ensure_ascii=False JSON; relying on the platform default encoding can
# make those records fail to encode.
logger = logging.getLogger()
logger.addHandler(logging.FileHandler('log', encoding='utf-8'))
logger.setLevel(logging.DEBUG)
# Which site to scrape: 0 - douban, 1 - amazon.
spidetp = 1

# HTTP request headers; the defaults target douban and are overridden
# below when amazon is selected.
mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'http://www.douban.com',
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}

# Query-string parameters and search endpoint for the selected site.
# The 'bkname_xxx' / 'bookname_xxx' values are placeholders.
mparams = {}
murl = ""
if spidetp == 0:
    # NOTE(review): the Host header above is www.douban.com while the
    # endpoint host is search.douban.com - confirm this is intended.
    mparams['Host'] = 'www.douban.com'  # was a 1-tuple (stray trailing comma)
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat'] = '1001'
    mparams['k'] = 'bookname_xxx'
    murl = "https://search.douban.com/book/subject_search"
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'http://www.amazon.cn'  # was 'http:/www.amazon.cn'
    # Reference URL:
    # https://www.amazon.cn/s?k={bookname}&i=stripbooks&__mk_zh_CN=亚马逊网站&ref=nb_sb_noss
    mparams['Host'] = 'www.amazon.cn'
    mparams['k'] = 'bkname_xxx'
    mparams['i'] = 'stripbooks'
    # Keys now match the reference URL: the old '__mk_zh_CN=' key carried a
    # stray '=' and the old 'reg' key was a typo for 'ref'.
    mparams['__mk_zh_CN'] = '亚马逊网站'
    mparams['ref'] = 'nb_sb_noss'
    murl = 'https://www.amazon.cn/s'
# Book titles fed to the spider by the __main__ driver, one search each.
testbooks = [
    '24堂财富课',
    '甲骨文',
    '庆余年(精校版)',
    '商君书-中华经典名著全本全注全译丛书',
    '苏世民:我的经验与教训(2018读桥水达利欧的原则,2020看黑石苏世民的经验!一本书读懂从白手起家到华尔街新国王的传奇人生)',
    '杨伯峻_论语译注',
    '小窗幽记',
    '少年凯歌',
    '投资要义',
    '白鱼解字',
    '历史的巨镜',
    '货币的教训',
    '钱从哪里来',
    '中国古代简史',
    '罗马人的故事(套装共15册)',
    '改变心理学的40项研究',
    '如何假装懂音乐',
    '管子(上下册)--中华经典名著全本全注全译(精)',
    '投资中最简单的事',
    '薛兆丰经济学讲义',
    '枪炮、病菌与钢铁:人类社会的命运(世纪人文系列丛书·开放人文)',
    '中央帝国的哲学密码',
    '新编说文解字大全集(超值白金版)',
    '市场的逻辑(增订本)',
    '金融的本质:伯南克四讲美联储(看一个风云人物的金融思考)',
    '从零开始学写作:个人增值的有效方法',
    '中国国家治理的制度逻辑:一个组织学研究',
    '中国为什么有前途:对外经济关系的战略潜能(第3版)',
    '日本的世界观(《剑桥日本史》主编凝练之作三个人物故事串起日本两百年变局了解近代日本转向的必读之书理想国出品))',
]
# Sample response text; nothing in the visible code references it.
# NOTE(review): presumably a trimmed Douban search-result fragment kept for
# manual regex experiments - confirm before relying on it.
testresp = """
9.4
(145人评价)
林毅夫 / 北京大学出版社 / 2018
《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...
div class="result">
9.4
(145人评价)
林毅夫 / 北京大学出版社 / 2018
《解读中国经济》是解读中国经济之作,总结了中国与其他国家、地区经济发展和改革活动的经验,提出了一个经济发展和转型的一般理论,并以此理论分析中国在改革和发展过...
"""
class doubanSpide():
    """Fetch a book search page and extract per-book details from it.

    The target site is fixed at class-definition time by the module-level
    ``spidetp`` flag (0 = Douban, anything else = Amazon).  The response is
    scanned line by line with a small state machine; each ``re_*`` pattern
    below is applied to one stripped line.  A pattern left as ``None`` is
    not available for the active site.
    """

    # Declare every pattern the parsers read, so an unavailable pattern is
    # None instead of a missing attribute.  (The former declaration listed
    # re_bn twice and omitted re_asin and re_img.)
    re_bn = None      # book title (Douban: also sid and cover image URL)
    re_asin = None    # Amazon item id
    re_img = None     # Amazon cover image URL
    re_rate = None    # rating
    re_norate = None  # Douban marker for "no rating"
    re_author = None  # author
    re_end = None     # Amazon marker after which the next item is expected

    if spidetp == 0:
        re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_norate = re.compile(r'''class=\"allstar00\"''')
        re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''')
    else:
        # NOTE(review): the Amazon patterns look incomplete.  re_img, re_bn
        # and re_author are never defined for Amazon, and re_asin / re_rate
        # contain no capture group although the parser reads group(1).
        # Until they are restored the Amazon parser extracts nothing.
        re_asin = re.compile(r'''^<\/span>.+$''')
        re_rate = re.compile(r'''^$''')
        #re_end = re.compile(r'''<\/body><\/html>''')
        re_end = re.compile(r'''^<\/span><\/div><\/div>''')
        #amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ

    def __init__(self):
        pass

    @staticmethod
    def _search(pattern, line, groups=0):
        """Search ``line`` with ``pattern`` and return the match or None.

        None is also returned when ``pattern`` is None or has fewer than
        ``groups`` capture groups, so an unusable pattern behaves like
        "no match" instead of raising.
        """
        if pattern is None or pattern.groups < groups:
            return None
        return pattern.search(line)

    def parse_books(self, mbkn: str):
        """Search the active site for ``mbkn`` and parse the result page.

        mbkn - book name to search for.

        Stores ``mbkn`` in the module-level ``mparams`` dict (key
        'search_text' for Douban, 'k' for Amazon) and issues a GET to
        ``murl`` with ``mheaders``.

        return: a defaultdict keyed by the site's book id, e.g.
            {
                "25853071": {   # sid
                    "bookname": "庆余年",
                    "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                    "rate": "8.0",
                    "author": "猫腻"
                }, ...}
            Entries may hold fewer than four keys when a field was not found.

        raise: Exception when the HTTP status code is not 200.
        """
        if spidetp == 0:  # douban
            mparams['search_text'] = mbkn
        else:  # amazon
            mparams['k'] = mbkn

        r = requests.get(url=murl, headers=mheaders, params=mparams)
        if r.status_code != 200:
            raise Exception("请求失败")  # message means "request failed"

        if spidetp == 0:
            return self._parse_douban(r.text)
        return self._parse_amazon(r.text)

    def _parse_douban(self, resp):
        """Extract books from Douban text; states SID -> RATE -> AUTHOR -> SID."""
        bkinfo = defaultdict(dict)
        sid = None
        stat = 'SID'
        for line in resp.split('\n'):
            line = line.strip()
            if not line:
                continue

            if stat == 'SID':
                ret = self._search(self.re_bn, line, 3)
                if ret:
                    sid = ret.group(1)
                    bkinfo[sid]['bookname'] = ret.group(2)
                    bkinfo[sid]['img'] = ret.group(3)
                    stat = 'RATE'
            elif stat == 'RATE':
                # A book without a rating has no rate/author to collect:
                # go straight back to looking for the next book.
                if self._search(self.re_norate, line):
                    stat = 'SID'
                    continue
                ret = self._search(self.re_rate, line, 1)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat == 'AUTHOR':
                ret = self._search(self.re_author, line, 1)
                if ret:
                    # keep only the text before the first space
                    bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
                    stat = 'SID'
        return bkinfo

    def _parse_amazon(self, resp):
        """Extract books from Amazon text.

        States: ASIN -> IMG -> BOOKNAME -> AUTHOR -> RATE.  A line that
        does not advance the current state is checked against ``re_end``;
        when that matches, the machine returns to ASIN.
        """
        bkinfo = defaultdict(dict)
        sid = None
        stat = 'ASIN'
        for line in resp.split('\n'):
            line = line.strip()
            if not line:
                continue

            ret = None
            if stat == 'ASIN':
                ret = self._search(self.re_asin, line, 1)
                if ret:
                    sid = ret.group(1)
                    stat = 'IMG'
            elif stat == 'IMG':
                ret = self._search(self.re_img, line, 1)
                if ret:
                    bkinfo[sid]['img'] = ret.group(1)
                    stat = 'BOOKNAME'
            elif stat == 'BOOKNAME':
                ret = self._search(self.re_bn, line, 1)
                if ret:
                    # cut the title at the first bracket or whitespace
                    bkname = re.split(r'[((\s]', ret.group(1).strip())[0]
                    bkinfo[sid]['bookname'] = bkname
                    stat = 'AUTHOR'
            elif stat == 'AUTHOR':
                ret = self._search(self.re_author, line)
                if ret:
                    # NOTE(review): this split pattern starts with an empty
                    # alternative, so it also splits on empty matches; it
                    # looks truncated - confirm the intended pattern.
                    # (Now a raw string: '\/' is not a valid str escape.)
                    author = ','.join(re.split(r'|<\/span', ret.group(0))[3::4])
                    bkinfo[sid]['author'] = author
                    stat = 'RATE'
            elif stat == 'RATE':
                ret = self._search(self.re_rate, line, 1)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
                    # NOTE(review): goes back to AUTHOR, not ASIN; the
                    # end-marker check below is what returns to ASIN.
                    stat = 'AUTHOR'

            if ret:
                continue
            if self._search(self.re_end, line):
                # Was `stat=='ASIN'`, a comparison with no effect, so the
                # machine never went back to looking for the next item.
                stat = 'ASIN'
        return bkinfo

    def filter_spide_books(self, mbkn, mbkinfo):
        """Pick the single best candidate for ``mbkn`` from parsed results.

        mbkn    - book name that was searched for.
        mbkinfo - mapping of book id to details, as built by parse_books:
            douban
                {
                    "25853071": {   # sid
                        "bookname": "庆余年",
                        "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                        "rate": "8.0",
                        "author": "猫腻"
                    }, ...}
            amazon
                "B07RN73425": {
                    "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
                    "bookname": "古典名著普及文库:孟子",
                    "author": "孙钦善",
                    "rate": "3.9"
                }

        Only entries with exactly four fields are considered.  Preference,
        in iteration order within each tier:
          1. the first entry whose bookname contains ``mbkn``;
          2. the first entry whose bookname is contained in ``mbkn``;
          3. the first complete entry of any name.

        return: a one-item dict ``{id: details}``, or 0 when no complete
        entry exists.
        """
        #booklink - https://book.douban.com/subject/{sid}
        contained = None  # tier 2 candidate
        fallback = None   # tier 3 candidate
        for k, v in mbkinfo.items():
            # Check completeness before reading 'bookname': a partially
            # parsed entry may lack it and used to raise KeyError here.
            if len(v) != 4 or 'bookname' not in v:
                continue
            bkn = v['bookname']
            if mbkn in bkn:
                return {k: v}  # tier 1 always wins, no need to look further
            if contained is None and bkn in mbkn:
                contained = {k: v}
            elif fallback is None:
                fallback = {k: v}

        return contained or fallback or 0
if __name__ == '__main__':
    # Script entry point: search every sample title and log the candidate
    # chosen for it.
    crawler = doubanSpide()
    for raw_title in testbooks:
        # Search only with the part of the title before the first bracket,
        # dash, colon, underscore or whitespace.
        pieces = re.split(r'[\((\-\::_\s]', raw_title.strip())
        short_title = pieces[0]
        print(short_title)

        candidates = crawler.parse_books(short_title)
        chosen = crawler.filter_spide_books(short_title, candidates)

        banner = '================ {} ================'.format(short_title)
        logger.debug(banner)
        # Log only the chosen entry, not the full candidate list.
        chosen_json = json.dumps(chosen, indent=2, ensure_ascii=False)
        logger.debug(chosen_json)