diff --git a/tbook.py b/tbook.py index 63bb89a..08fd083 100644 --- a/tbook.py +++ b/tbook.py @@ -138,7 +138,7 @@ class doubanSpide(): """ - [re_bn,re_bn,re_rate,re_norate,re_author] = [None,None,None,None,None] + [re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None] if spidetp==0: re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S) re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''') @@ -146,12 +146,15 @@ class doubanSpide(): re_norate = re.compile(r'''class=\"allstar00\"''') re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''') else: + re_asin = re.compile(r'''^
(.+?)<''') - re_norate = re.compile(r'''class=\"allstar00\"''') - re_author = re.compile(r'''class=\"subject-cast\">(.+?)<''') + re_author = re.compile(r'''^
<\/span>.+$''') + re_rate = re.compile(r'''^$''') + #re_end = re.compile(r'''<\/body><\/html>''') + re_end = re.compile(r'''^<\/span><\/div><\/div>''') + + #amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ def __init__(self): pass @@ -171,6 +174,7 @@ class doubanSpide(): mparams['search_text'] = mbkn else: #amazon mparams['k'] = mbkn + r = requests.get( url=murl, headers=mheaders, params=mparams) if r.status_code != 200: @@ -178,16 +182,17 @@ class doubanSpide(): bkinfo = defaultdict(dict) sid = None - stat = 'NAME' + stat = None resp = r.text if spidetp==0: + stat = 'SID' for line in resp.split('\n'): line = line.strip() if line=='': continue - if stat=='NAME': + if stat=='SID': ret=re.search(self.re_bn, line) if ret: sid = ret.group(1) @@ -198,7 +203,7 @@ class doubanSpide(): elif stat=='RATE': # if no rate, goto next bookname state if re.search(self.re_norate, line): - stat = 'NAME' + stat = 'SID' continue ret=re.search(self.re_rate, line) if ret: @@ -209,30 +214,78 @@ class doubanSpide(): ret=re.search(self.re_author, line) if ret: bkinfo[sid]['author'] = ret.group(1).split(' ')[0] - stat = 'NAME' - continue + stat = 'SID' else: continue else: - logger.debug('================ {} ================'.format(mbkn)) + stat='ASIN' for line in resp.split('\n'): line = line.strip() if line=='': continue - logger.debug(line) + if stat=='ASIN': + ret=re.search(self.re_asin, line) + if ret: + sid = ret.group(1) + stat = 'IMG' + continue + elif stat=='IMG': + ret=re.search(self.re_img, line) + if ret: + bkinfo[sid]['img'] = ret.group(1) + stat = 'BOOKNAME' + continue + elif stat=='BOOKNAME': + ret=re.search(self.re_bn, line) + if ret: + bkname = re.split(r'[((\s]',ret.group(1).strip())[0] + bkinfo[sid]['bookname'] = bkname + stat = 'AUTHOR' + continue + elif stat=='AUTHOR': + ret=re.search(self.re_author, line) + if ret: + author = ','.join(re.split('|<\/span', ret.group(0))[3::4]) + bkinfo[sid]['author'] = author + stat = 'RATE' + continue + elif stat=='RATE': + ret=re.search(self.re_rate, line) + if ret: + bkinfo[sid]['rate'] = ret.group(1).split(' ')[0] + stat = 'AUTHOR' + continue + else: continue + + if re.search(self.re_end, line): + stat=='ASIN' + continue return bkinfo def filter_spide_books(self, mbkn, mbkinfo): """ mbkn - bookname to be spide - mbkinfo: { + mbkinfo: + douban + { "25853071": { # sid - "bookname": "庆余年", - "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg", - "rate": "8.0", - "author": "猫腻" + "bookname": "庆余年", + "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg", + "rate": "8.0", + "author": "猫腻" },...} + amazon + "B07RN73425": { + "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg", + "bookname": "古典名著普及文库:孟子", + "author": "孙钦善", + "rate": "3.9" + } + """ #booklink - https://book.douban.com/subject/{sid} + # f1/d1: mbkn include in bookname + # f2/d2: bookname include mbkn + # f3/d3: mbkn and bookname different [f1,f2,f3] = [0,0,0] [d1,d2,d3] = [{},{},{}] for k,v in mbkinfo.items(): diff --git a/tparseamazon.py b/tparseamazon.py index 1760308..dac37f5 100644 --- a/tparseamazon.py +++ b/tparseamazon.py @@ -1,3 +1,10 @@ +######################################################### +## @file : tparseamazon.py +## @desc : test parse amazon response +## @create : 2020/05/26 +## @author : Chengan +## @email : douboer@gmail.com +######################################################### import re import json @@ -11,9 +18,33 @@ s =['''
堀田江理(Eri Hotta)
''', '''
景跃进, 张小劲余逊达
'''] +tre = ['''
''', +'''野田洋次郎蒋青青
''', +'''''', +'''
'''] + + for t in s: ret = re.split('|<\/span',t) fret = ret[3::4] #print(json.dumps(re.split('|<\/span',t), indent=2, ensure_ascii=False)) print(','.join(fret)) +re_asin = re.compile(r'''^
<\/span>.+$''') +re_rate = re.compile(r'''^$''') +#re_end = re.compile(r'''<\/body><\/html>''') +re_end = re.compile(r'''^<\/span><\/div><\/div>''') + +print(re.search(re_asin, tre[0]).group(1)) +print(re.search(re_img , tre[1]).group(1)) +print(re.search(re_bn , tre[2]).group(1)) +print(re.search(re_author,tre[3]).group(0)) +print(re.search(re_rate, tre[4]).group(1)) +print(re.search(re_end , tre[5]).group(0)) + + diff --git a/x b/x index d990b90..a2a8531 100644 --- a/x +++ b/x @@ -1,2962 +1,23 @@ 24堂财富课 - - - - - - - - - - - - - - - - - - - - - - - -亚马逊 : 24堂财富课 - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - -beats - - -Kindle unlimited - - -boutique - - -BTS - - -game - - - - - - - - - - - - -
- -
- - - - - - - - - - - - - - - -
- - +甲骨文 +庆余年 +商君书 +苏世民 +杨伯峻 +小窗幽记 +少年凯歌 +投资要义 +白鱼解字 +历史的巨镜 +货币的教训 +钱从哪里来 +中国古代简史 +罗马人的故事 +改变心理学的40项研究 +如何假装懂音乐 +管子 +投资中最简单的事 +薛兆丰经济学讲义 +枪炮、病菌与钢铁 +中央帝国的哲学密码 +新编说文解字大全集