#########################################################
## @file   : parseweb.py (refactored)
## @desc   : Douban & Amazon book spider (Douban primary)
#########################################################
#
# NOTE(review): the copy of this file that was reviewed had HTML-like text
# stripped out of several string literals.  Every pattern that had to be
# reconstructed is marked "NOTE(review)" below; confirm each one against
# version control before relying on it.

import logging
import os
import re
from collections import defaultdict
from html import unescape
from urllib.parse import quote, urlsplit

import requests

logger = logging.getLogger()
if not logger.handlers:
    logger.addHandler(logging.FileHandler('log'))
logger.setLevel(logging.INFO)

ISDOUBAN = 1
IMGPATH = './downimg'
LINKPREF = 'https://book.douban.com/subject/' if ISDOUBAN else 'https://www.amazon.cn/s?k='

mheaders = {
    'Host': 'www.douban.com',
    'Referer': 'https://www.douban.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36'
}

mparams = {}
murl = ''
if ISDOUBAN:
    mparams['Host'] = 'www.douban.com'
    mparams['search_text'] = 'bkname_xxx'
    mparams['cat'] = '1001'
    mparams['k'] = 'bookname_xxx'
    murl = 'https://search.douban.com/book/subject_search'
else:
    mheaders['Host'] = 'www.amazon.cn'
    mheaders['Referer'] = 'https://www.amazon.cn'
    mparams['Host'] = 'www.amazon.cn'
    mparams['k'] = 'bkname_xxx'
    mparams['i'] = 'stripbooks'
    mparams['__mk_zh_CN='] = '亚马逊网站'
    mparams['reg'] = 'nb_sb_noss'
    murl = 'https://www.amazon.cn/s'

TEST_BOOKS = [
    '24堂财富课', '甲骨文', '庆余年(精校版)', '商君书-中华经典名著全本全注全译丛书', '苏世民:我的经验与教训',
    '杨伯峻_论语译注', '小窗幽记', '少年凯歌', '投资要义', '白鱼解字', '历史的巨镜', '货币的教训', '钱从哪里来',
    '中国古代简史', '罗马人的故事(套装共15册)', '改变心理学的40项研究', '如何假装懂音乐', '管子(上下册)',
    '投资中最简单的事', '薛兆丰经济学讲义', '枪炮、病菌与钢铁:人类社会的命运', '中央帝国的哲学密码', '新编说文解字大全集',
    '市场的逻辑(增订本)', '金融的本质:伯南克四讲美联储', '从零开始学写作', '中国国家治理的制度逻辑', '中国为什么有前途', '日本的世界观'
]


def _request_headers():
    """Return a copy of ``mheaders`` without the pinned ``Host`` entry.

    The spider talks to several hosts (www/book/search subdomains and the
    image servers).  Sending one fixed ``Host`` value to all of them is wrong
    for every host but one, so the entry is dropped and ``requests`` derives
    the header from each URL instead.
    """
    return {k: v for k, v in mheaders.items() if k.lower() != 'host'}


def _new_session():
    """Create a session carrying the spider headers and warm-up cookies.

    The home page is visited first so the session picks up cookies.  That
    visit is best-effort: a failure is ignored and the session is returned
    anyway.  The caller is responsible for closing the session.
    """
    session = requests.Session()
    session.headers.update(_request_headers())
    try:
        session.get('https://www.douban.com/', timeout=10)
    except requests.RequestException as exc:
        logger.debug('warm-up request failed: %s', exc)
    return session


class BookInfoSpider:
    """Look up book metadata (title, cover, author, publisher, description).

    Two lookup paths exist:

    * ``grab_book_info_new`` -- Douban suggest API plus the subject page.
    * ``grab_book_info``     -- line-by-line scrape of a search result page
      (Douban or Amazon, selected by the module-level ``ISDOUBAN`` flag).

    Failures are logged at WARNING level: this module sets the logger to INFO,
    so the debug-level messages used previously were filtered out.
    """

    # Patterns used on a Douban subject page (grab_book_info_new).
    # NOTE(review): all four were damaged in the reviewed copy; the tags and
    # attributes below are reconstructions -- confirm against the original.
    _re_intro = re.compile(r'<div class="intro">(.*?)</div>', re.S)
    _re_paragraph = re.compile(r'<p>(.*?)</p>', re.S)
    _re_info_block = re.compile(r'<div id="info"[^>]*>(.*?)</div>', re.S)
    _re_line_break = re.compile(r'<br\s*/?>')
    _re_tag = re.compile(r'<.*?>')

    if ISDOUBAN:
        re_bn = re.compile(r'class="nbg.+?sid: (\d+?),.+?title="(.+?)".+?img src="(.+?)"')
        # NOTE(review): only r'^' survived; a capture group is required
        # because the parser reads group(1).  Reconstructed -- confirm.
        re_star = re.compile(r'^<span class="allstar(\d+)"></span>')
        re_score = re.compile(r'class="rating_nums">(.+?)<')
        re_ratenum = re.compile(r'^\((\d+)人评价\)')
        re_author = re.compile(r'class="subject-cast">(.+?)<')
        # NOTE(review): the paragraph tags were stripped; reconstructed.
        re_description = re.compile(r'^<p>(.+?)(</p>){0,1}$')
    else:
        # NOTE(review): the Amazon patterns could not be recovered.  The three
        # fragments below are kept exactly as they survived (they lack the
        # capture groups the parser needs), and re_img / re_bn / re_author
        # were lost entirely.  Restore all six from version control before
        # setting ISDOUBAN = 0.
        re_asin = re.compile(r'^<\/span>.+$')
        re_rate = re.compile(r'^$')
        re_end = re.compile(r'^<\/span><\/div><\/div>')

    def _douban_suggest(self, query: str) -> list:
        """Call the Douban suggest API for ``query``.

        Returns the decoded JSON list (items are expected to be dicts with
        ``id``/``title``/``url``/``pic`` keys), or ``[]`` on any failure or
        when the payload is not a list.
        """
        url = f'https://book.douban.com/j/subject_suggest?q={quote(query)}'
        try:
            with _new_session() as session:
                r = session.get(url, timeout=8)
                if r.status_code == 200:
                    data = r.json()
                    return data if isinstance(data, list) else []
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.warning('suggest error %s: %s', query, e)
        return []

    def _fetch_subject(self, sid: str) -> str:
        """Return the HTML of the Douban subject page ``sid``, or ``''``."""
        url = f'https://book.douban.com/subject/{sid}/'
        try:
            r = requests.get(url, headers=_request_headers(), timeout=10)
            if r.status_code == 200:
                return r.text
        except requests.RequestException as e:
            logger.warning('subject fetch %s error: %s', sid, e)
        return ''

    def _extract_description(self, html: str) -> str:
        """Return the plain-text introduction found in a subject page.

        Paragraphs of the first intro block are stripped of tags, HTML
        entities are unescaped and the paragraphs are joined with newlines.
        Returns ``''`` when nothing is found.
        """
        if not html:
            return ''
        m = self._re_intro.search(html)
        if not m:
            return ''
        paragraphs = self._re_paragraph.findall(m.group(1))
        return '\n'.join(
            unescape(self._re_tag.sub('', p)).strip() for p in paragraphs
        )

    def grab_book_info_new(self, bookname: str):
        """Look ``bookname`` up through the suggest API and its subject page.

        Returns ``[bookname, bkinfo]`` where ``bkinfo`` maps the chosen
        subject id to a dict with ``link``, ``bookname``, ``img`` and, when
        found, ``description``, ``author``, ``publisher`` and ``publishing``.
        Returns ``None`` outside Douban mode or when nothing usable is
        suggested.
        """
        if ISDOUBAN != 1:
            return None
        suggestions = [s for s in self._douban_suggest(bookname) if isinstance(s, dict)]
        if not suggestions:
            return None

        def score(item):
            # Containment scores highest, weighted by how much of the longer
            # string is covered; otherwise fall back to character overlap.
            title = item.get('title', '')
            if bookname in title:
                return 3 * len(bookname) / max(len(title), 1)
            if title in bookname:
                return 2 * len(title) / max(len(bookname), 1)
            return len(set(bookname) & set(title)) / max(len(title), 1)

        # max() returns the first best item, the same pick a stable
        # descending sort would put at index 0.
        chosen = max(suggestions, key=score)
        raw_sid = chosen.get('id')
        if not raw_sid:
            # Without an id there is no subject page to fetch or link to.
            return None
        sid = str(raw_sid)

        html = self._fetch_subject(sid)
        desc = self._extract_description(html)

        author = ''
        publisher = ''
        publishing = ''
        info_blk = self._re_info_block.search(html)
        if info_blk:
            info_txt = self._re_line_break.sub('\n', info_blk.group(1))
            info_txt = self._re_tag.sub('', info_txt)
            for line in (part.strip() for part in info_txt.split('\n')):
                if not line:
                    continue
                value = line.split(':', 1)[-1].strip()
                # Only the first occurrence of each field is kept.
                if line.startswith('作者') and not author:
                    author = value
                elif line.startswith('出版社') and not publisher:
                    publisher = value
                elif '出版年' in line and not publishing:
                    publishing = value

        bkinfo = defaultdict(dict)
        entry = bkinfo[sid]
        entry['link'] = f'https://book.douban.com/subject/{sid}'
        entry['bookname'] = chosen.get('title', '')
        entry['img'] = chosen.get('pic', '')
        if desc:
            entry['description'] = desc
        if author:
            entry['author'] = author
        if publisher:
            entry['publisher'] = publisher
        if publishing:
            entry['publishing'] = publishing
        return [bookname, bkinfo]

    def grab_book_info(self, mbkn: str):
        """Search for ``mbkn`` and scrape every hit from the result page.

        Returns ``[mbkn, bkinfo]`` where ``bkinfo`` maps each subject id
        (Douban) or ASIN (Amazon) to the fields that could be parsed.  On a
        request error or a non-200 response ``bkinfo`` is empty.
        """
        # Work on a copy so the module-level template is not mutated.
        params = dict(mparams)
        params['k'] = mbkn
        if ISDOUBAN == 1:
            params['search_text'] = mbkn

        try:
            with _new_session() as session:
                r = session.get(murl, params=params, timeout=10)
        except requests.RequestException as e:
            logger.warning('request error %s: %s', mbkn, e)
            return [mbkn, defaultdict(dict)]

        if r.status_code != 200:
            # An error page holds no results; do not try to parse it.
            logger.warning('status %s for %s', r.status_code, mbkn)
            return [mbkn, defaultdict(dict)]

        if ISDOUBAN == 1:
            return [mbkn, self._parse_douban(r.text)]
        return [mbkn, self._parse_amazon(r.text)]

    def _parse_douban(self, resp: str):
        """Walk a Douban result page line by line and collect book fields.

        A small state machine expects, per book, the fields in this order:
        SID -> STAR -> (SCORE -> RATENUM) -> AUTHOR -> DESCRIPTION.
        """
        bkinfo = defaultdict(dict)
        sid = None
        stat = 'SID'
        for line in resp.split('\n'):
            line = line.strip()
            if not line:
                continue
            if stat == 'SID':
                ret = self.re_bn.search(line)
                if ret:
                    sid = ret.group(1)
                    # Plain concatenation: os.path.join is for file paths.
                    bkinfo[sid]['link'] = LINKPREF + sid
                    bkinfo[sid]['bookname'] = ret.group(2)
                    bkinfo[sid]['img'] = ret.group(3)
                    stat = 'STAR'
            elif stat == 'STAR':
                ret = self.re_star.search(line)
                if ret:
                    star = ret.group(1)
                    if star == '00':
                        # Unrated book: no score / rating count follows.
                        stat = 'AUTHOR'
                    elif star.isdigit() and int(star) > 0:
                        stat = 'SCORE'
            elif stat == 'SCORE':
                ret = self.re_score.search(line)
                if ret:
                    bkinfo[sid]['score'] = ret.group(1)
                    stat = 'RATENUM'
            elif stat == 'RATENUM':
                ret = self.re_ratenum.search(line)
                if ret:
                    bkinfo[sid]['ratenum'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat == 'AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    parts = ret.group(1).split(' / ')
                    if len(parts) >= 3:
                        # The last two entries are publisher and date; all
                        # entries before them are authors / translators.
                        *authors, publisher, publishing = parts
                        bkinfo[sid]['publisher'] = publisher
                        bkinfo[sid]['publishing'] = publishing
                        bkinfo[sid]['author'] = '/'.join(authors)
                    else:
                        bkinfo[sid]['author'] = ret.group(1)
                    stat = 'DESCRIPTION'
            elif stat == 'DESCRIPTION':
                ret = self.re_description.search(line)
                if ret:
                    bkinfo[sid]['description'] = ret.group(1).strip()
                    stat = 'SID'
        return bkinfo

    def _parse_amazon(self, resp: str):
        """Walk an Amazon result page line by line and collect book fields.

        NOTE(review): depends on the Amazon patterns that were damaged or
        lost (see the class body); this path cannot work until they are
        restored.  The split pattern in the AUTHOR step is also a damaged
        fragment kept as it survived.
        """
        bkinfo = defaultdict(dict)
        sid = None
        stat = 'ASIN'
        for line in resp.split('\n'):
            line = line.strip()
            if not line:
                continue
            if stat == 'ASIN':
                ret = self.re_asin.search(line)
                if ret:
                    sid = ret.group(1)
                    # Plain concatenation: os.path.join would insert a '/'
                    # after the '?k=' prefix.
                    bkinfo[sid]['link'] = LINKPREF + sid
                    stat = 'IMG'
                    continue
            elif stat == 'IMG':
                ret = self.re_img.search(line)
                if ret:
                    bkinfo[sid]['img'] = ret.group(1)
                    stat = 'BOOKNAME'
                    continue
            elif stat == 'BOOKNAME':
                ret = self.re_bn.search(line)
                if ret:
                    bkname = re.split(r'[((\s]', ret.group(1).strip())[0]
                    bkinfo[sid]['bookname'] = bkname
                    stat = 'AUTHOR'
                    continue
            elif stat == 'AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    author = ','.join(re.split(r'|<\/span', ret.group(0))[3::4])
                    bkinfo[sid]['author'] = author
                    stat = 'RATE'
                    continue
            elif stat == 'RATE':
                ret = self.re_rate.search(line)
                if ret:
                    bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
                    stat = 'AUTHOR'
                    continue
            # Reached only by lines the current state did not consume: the
            # end marker resets the machine for the next result.
            if self.re_end.search(line):
                stat = 'ASIN'
        return bkinfo

    def filter_spide_book(self, mbkinfo):
        """Pick the scraped entry whose title best matches the query.

        ``mbkinfo`` is the ``[query, bkinfo]`` pair returned by the grab
        methods.  Returns ``{query: entry}`` for the best entry, or ``None``
        when there is no entry with a title.  On equal scores the earliest
        entry wins.
        """
        if not mbkinfo:
            return None
        mbkn = mbkinfo[0]
        best_score = None
        best_entry = None
        for entry in mbkinfo[1].values():
            bkn = entry.get('bookname', '')
            if not bkn:
                continue
            score = 0
            if mbkn in bkn:
                score += 3
            if bkn in mbkn:
                score += 2
            # Small tie-breaker: number of shared characters.
            score += len(set(mbkn) & set(bkn)) * 0.01
            if best_score is None or score > best_score:
                best_score = score
                best_entry = entry
        if best_entry is None:
            return None
        return {mbkn: best_entry}

    def down_book_img(self, mbkinfo):
        """Download the cover image of every entry into ``IMGPATH``.

        ``mbkinfo`` maps a key to an entry dict; entries without an ``img``
        link are skipped, as are images that already exist on disk.
        Download and write errors are logged and do not stop the loop.
        """
        if not mbkinfo:
            return
        os.makedirs(IMGPATH, exist_ok=True)
        headers = _request_headers()
        for entry in mbkinfo.values():
            link = entry.get('img')
            if not link:
                continue
            # Take the name from the URL path so a query string never ends
            # up in the file name.
            fname = os.path.basename(urlsplit(link).path)
            if not fname:
                continue
            target = os.path.join(IMGPATH, fname)
            if os.path.exists(target):
                continue
            try:
                img = requests.get(link, headers=headers, timeout=10)
                if img.status_code == 200:
                    with open(target, 'wb') as fp:
                        fp.write(img.content)
            except (requests.RequestException, OSError) as e:
                logger.warning('download img error %s: %s', link, e)