murl = ""
#########################################################
## @file : parseweb.py (refactored)
## @desc : Douban & Amazon book spider (Douban primary)
#########################################################
import requests
import re
import os
import logging
from collections import defaultdict
from html import unescape
from urllib.parse import quote
logger = logging.getLogger()
if not logger.handlers:
logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.INFO)
ISDOUBAN = 1
IMGPATH = './downimg'
LINKPREF = 'https://book.douban.com/subject/' if ISDOUBAN else 'https://www.amazon.cn/s?k='
mheaders = {
'Host': 'www.douban.com',
'Referer': 'https://www.douban.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36'
}
mparams = {}
murl = ''
if ISDOUBAN:
    mparams['search_text'] = 'bkname_xxx'   # placeholder, replaced per query
    mparams['cat'] = '1001'                  # Douban category id for books
    murl = 'https://search.douban.com/book/subject_search'
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'https://www.amazon.cn'
    mparams['k'] = 'bkname_xxx'              # placeholder, replaced per query
    mparams['i'] = 'stripbooks'
    mparams['__mk_zh_CN'] = '亚马逊网站'
    mparams['ref'] = 'nb_sb_noss'
    murl = 'https://www.amazon.cn/s'
TEST_BOOKS = [
'24堂财富课','甲骨文','庆余年(精校版)','商君书-中华经典名著全本全注全译丛书','苏世民:我的经验与教训',
'杨伯峻_论语译注','小窗幽记','少年凯歌','投资要义','白鱼解字','历史的巨镜','货币的教训','钱从哪里来',
'中国古代简史','罗马人的故事(套装共15册)','改变心理学的40项研究','如何假装懂音乐','管子(上下册)',
'投资中最简单的事','薛兆丰经济学讲义','枪炮、病菌与钢铁:人类社会的命运','中央帝国的哲学密码','新编说文解字大全集',
'市场的逻辑(增订本)','金融的本质:伯南克四讲美联储','从零开始学写作','中国国家治理的制度逻辑','中国为什么有前途','日本的世界观'
]
class BookInfoSpider:
    # Line-oriented regexes for the legacy search-result parser (grab_book_info).
    # NOTE: several of these patterns are best-effort reconstructions of the
    # Douban / Amazon result markup and may need adjusting against the current page source.
    if ISDOUBAN:
        re_bn = re.compile(r'class="nbg.+?sid: (\d+?),.+?title="(.+?)".+?img src="(.+?)"')
        re_star = re.compile(r'class="allstar(\d+)"')        # star-rating class; "00" means unrated
        re_score = re.compile(r'class="rating_nums">(.+?)<')
        re_ratenum = re.compile(r'^\((\d+)人评价\)')
        re_author = re.compile(r'class="subject-cast">(.+?)<')
        re_description = re.compile(r'^<p>(.+?)(</p>)?$')    # one-line abstract paragraph
    else:
        re_asin = re.compile(r'data-asin="([A-Z0-9]+)"')
        re_img = re.compile(r'<img src="(.+?)"')
        re_bn = re.compile(r'<span class="a-size-medium[^"]*">(.+?)<')
        re_author = re.compile(r'(?:<span class="a-size-base[^>]*>.*?</span>)+')
        re_rate = re.compile(r'class="a-icon-alt">(.+?)<')   # e.g. "4.5 颗星,最多 5 颗星"
        re_end = re.compile(r'^</span></div></div>')
def _douban_suggest(self, query: str):
"""调用豆瓣 suggest 接口返回列表[{id,title,url,pic}]"""
session = requests.Session()
session.headers.update(mheaders)
        # Visit the homepage first to pick up session cookies.
try:
session.get('https://www.douban.com/', timeout=10)
        except requests.RequestException:
            pass
url = f'https://book.douban.com/j/subject_suggest?q={quote(query)}'
try:
r = session.get(url, timeout=8)
if r.status_code == 200:
return r.json()
except Exception as e:
logger.debug(f"suggest error {query}: {e}")
return []
def _fetch_subject(self, sid: str):
url = f'https://book.douban.com/subject/{sid}/'
try:
r = requests.get(url, headers=mheaders, timeout=10)
if r.status_code == 200:
return r.text
except Exception as e:
logger.debug(f'subject fetch {sid} error: {e}')
return ''
    def _extract_description(self, html: str) -> str:
        """Pull the book intro from a subject page; the intro is assumed to sit in a
        <div class="intro"> block of <p> paragraphs."""
        if not html:
            return ''
        m = re.search(r'<div class="intro">(.*?)</div>', html, re.S)
        if m:
            raw = m.group(1)
            ps = re.findall(r'<p>(.*?)</p>', raw, re.S)
            txt = '\n'.join(unescape(re.sub(r'<.*?>', '', p)).strip() for p in ps)
            if txt:
                return txt
        return ''
def grab_book_info_new(self, bookname: str):
if ISDOUBAN != 1:
return None
suggestions = self._douban_suggest(bookname)
if not suggestions:
return None
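        # Rank suggestions: the query contained in the title scores highest, the
        # reverse next, plain character overlap last.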
def score(item):
title = item.get('title','')
if bookname in title:
return 3*len(bookname)/max(len(title),1)
if title in bookname:
return 2*len(title)/max(len(bookname),1)
return len(set(bookname) & set(title)) / max(len(title),1)
suggestions.sort(key=score, reverse=True)
chosen = suggestions[0]
sid = str(chosen.get('id'))
html = self._fetch_subject(sid)
desc = self._extract_description(html)
author=''; publisher=''; publishing=''
        # Book metadata is assumed to live in the <div id="info"> block of the subject page.
        info_blk = re.search(r'<div id="info"[^>]*>(.*?)</div>', html, re.S)
        if info_blk:
            info_txt = re.sub(r'<br\s*/?>', '\n', info_blk.group(1))
            info_txt = re.sub(r'<.*?>', '', info_txt)
lines = [l.strip() for l in info_txt.split('\n') if l.strip()]
for line in lines:
if line.startswith('作者') and not author:
author = line.split(':',1)[-1].strip()
elif line.startswith('出版社') and not publisher:
publisher = line.split(':',1)[-1].strip()
elif re.search(r'出版年', line) and not publishing:
publishing = line.split(':',1)[-1].strip()
bkinfo = defaultdict(dict)
bkinfo[sid]['link'] = f'https://book.douban.com/subject/{sid}'
bkinfo[sid]['bookname'] = chosen.get('title','')
bkinfo[sid]['img'] = chosen.get('pic','')
if desc: bkinfo[sid]['description'] = desc
if author: bkinfo[sid]['author'] = author
if publisher: bkinfo[sid]['publisher'] = publisher
if publishing: bkinfo[sid]['publishing'] = publishing
return [bookname, bkinfo]
def grab_book_info(self, mbkn: str):
        if ISDOUBAN == 1:
            mparams['search_text'] = mbkn
        else:
            mparams['k'] = mbkn
session = requests.Session()
session.headers.update(mheaders)
        # Visit the Douban homepage first to pick up session cookies.
        if ISDOUBAN == 1:
            try:
                session.get('https://www.douban.com/', timeout=10)
            except requests.RequestException:
                pass
try:
r = session.get(murl, params=mparams, timeout=10)
except Exception as e:
logger.debug(f'request error {mbkn}: {e}')
return [mbkn, defaultdict(dict)]
        if r.status_code != 200:
            logger.debug(f'status {r.status_code} for {mbkn}')
            return [mbkn, defaultdict(dict)]
bkinfo = defaultdict(dict)
sid=None; stat=None
resp = r.text
if ISDOUBAN==1:
stat='SID'
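            # Walk the result page line by line as a small state machine:
            # SID -> STAR -> SCORE -> RATENUM -> AUTHOR -> DESCRIPTION, then back to
            # SID for the next hit; unrated entries jump from STAR straight to AUTHOR.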
for line in resp.split('\n'):
line=line.strip()
if not line: continue
if stat=='SID':
ret=re.search(self.re_bn, line)
if ret:
sid=ret.group(1)
                        bkinfo[sid]['link']=LINKPREF+sid
bkinfo[sid]['bookname']=ret.group(2)
bkinfo[sid]['img']=ret.group(3)
stat='STAR'
continue
elif stat=='STAR':
ret=re.search(self.re_star, line)
if ret:
star=ret.group(1)
if star=='00':
stat='AUTHOR'
elif star.isdigit() and int(star)>0:
stat='SCORE'
elif stat=='SCORE':
ret=re.search(self.re_score, line)
if ret:
bkinfo[sid]['score']=ret.group(1)
stat='RATENUM'
continue
elif stat=='RATENUM':
ret=re.search(self.re_ratenum, line)
if ret:
bkinfo[sid]['ratenum']=ret.group(1)
stat='AUTHOR'
continue
elif stat=='AUTHOR':
ret=re.search(self.re_author, line)
if ret:
tt=ret.group(1).split(' / ')
if len(tt)>=3:
*author, bkinfo[sid]['publisher'], bkinfo[sid]['publishing']=tt
bkinfo[sid]['author']='/'.join(author)
else:
bkinfo[sid]['author']=ret.group(1)
stat='DESCRIPTION'
continue
elif stat=='DESCRIPTION':
ret=re.search(self.re_description, line)
if ret:
bkinfo[sid]['description']=ret.group(1).strip()
stat='SID'
continue
else: # AMAZON
stat='ASIN'
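            # Same line-by-line state machine for Amazon result cards:
            # ASIN -> IMG -> BOOKNAME -> AUTHOR -> RATE; re_end marks the end of a
            # card and resets the machine to ASIN.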
for line in resp.split('\n'):
line=line.strip()
if not line: continue
if stat=='ASIN':
ret=re.search(self.re_asin, line)
if ret:
sid=ret.group(1)
                        bkinfo[sid]['link']=LINKPREF+ret.group(1)
stat='IMG'
continue
elif stat=='IMG':
ret=re.search(self.re_img, line)
if ret:
bkinfo[sid]['img']=ret.group(1)
stat='BOOKNAME'
continue
elif stat=='BOOKNAME':
ret=re.search(self.re_bn, line)
if ret:
bkname=re.split(r'[((\s]',ret.group(1).strip())[0]
bkinfo[sid]['bookname']=bkname
stat='AUTHOR'
continue
elif stat=='AUTHOR':
                    ret=re.search(self.re_author, line)
                    if ret:
                        # Best-effort: collect the author names from the matched spans;
                        # adjust to the actual result markup if needed.
                        names=re.findall(r'<span[^>]*>(.*?)</span>', ret.group(0))
                        bkinfo[sid]['author']=','.join(n.strip() for n in names if n.strip(' ,、'))
stat='RATE'
continue
elif stat=='RATE':
ret=re.search(self.re_rate, line)
if ret:
bkinfo[sid]['rate']=ret.group(1).split(' ')[0]
stat='AUTHOR'
continue
if re.search(self.re_end, line):
stat='ASIN'
return [mbkn, bkinfo]
def filter_spide_book(self, mbkinfo):
if not mbkinfo: return None
mbkn=mbkinfo[0]
best=None
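        # Containment either way dominates the score; character overlap only acts
        # as a tie-breaker (hence the 0.01 weight).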
for sid,v in mbkinfo[1].items():
bkn=v.get('bookname','')
if not bkn: continue
score=0
if mbkn in bkn: score+=3
if bkn in mbkn: score+=2
score+=len(set(mbkn)&set(bkn))*0.01
if (not best) or score>best[0]:
best=(score,{mbkn:v})
return best[1] if best else None
def down_book_img(self, mbkinfo):
if not mbkinfo: return
if not os.path.exists(IMGPATH): os.mkdir(IMGPATH)
for _,v in mbkinfo.items():
link=v.get('img')
if not link: continue
fname=link.split('/')[-1]
p=os.path.join(IMGPATH,fname)
if os.path.exists(p):
continue
try:
img=requests.get(link, headers=mheaders, timeout=10)
if img.status_code==200:
with open(p,'wb') as fp:
fp.write(img.content)
except Exception as e:
logger.debug(f'download img error {link}: {e}')
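
if __name__ == '__main__':
    # Example driver (sketch): run the Douban path over TEST_BOOKS, fall back to the
    # legacy search-page parser, keep the best match per title and download its cover.
    spider = BookInfoSpider()
    for name in TEST_BOOKS:
        result = spider.grab_book_info_new(name) or spider.grab_book_info(name)
        best = spider.filter_spide_book(result)
        if best:
            spider.down_book_img(best)
            logger.info(f'{name}: {best}')
        else:
            logger.info(f'{name}: no match found')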