<\/span>.+$''')
+ re_rate = re.compile(r'''^$''')
+ #re_end = re.compile(r'''<\/body><\/html>''')
+ re_end = re.compile(r'''^<\/span><\/div><\/div>''')
+
+ #amazon ASIN B07VKS1DRZ - https://www.amazon.cn/s?k=B07VKS1DRZ
def __init__(self):
pass
@@ -171,6 +174,7 @@ class doubanSpide():
mparams['search_text'] = mbkn
else: #amazon
mparams['k'] = mbkn
+
r = requests.get( url=murl, headers=mheaders, params=mparams)
if r.status_code != 200:
@@ -178,16 +182,17 @@ class doubanSpide():
bkinfo = defaultdict(dict)
sid = None
- stat = 'NAME'
+ stat = None
resp = r.text
if spidetp==0:
+ stat = 'SID'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
- if stat=='NAME':
+ if stat=='SID':
ret=re.search(self.re_bn, line)
if ret:
sid = ret.group(1)
@@ -198,7 +203,7 @@ class doubanSpide():
elif stat=='RATE':
# if no rate, goto next bookname state
if re.search(self.re_norate, line):
- stat = 'NAME'
+ stat = 'SID'
continue
ret=re.search(self.re_rate, line)
if ret:
@@ -209,30 +214,78 @@ class doubanSpide():
ret=re.search(self.re_author, line)
if ret:
bkinfo[sid]['author'] = ret.group(1).split(' ')[0]
- stat = 'NAME'
- continue
+ stat = 'SID'
else: continue
else:
- logger.debug('================ {} ================'.format(mbkn))
+ stat='ASIN'
for line in resp.split('\n'):
line = line.strip()
if line=='': continue
- logger.debug(line)
+ if stat=='ASIN':
+ ret=re.search(self.re_asin, line)
+ if ret:
+ sid = ret.group(1)
+ stat = 'IMG'
+ continue
+ elif stat=='IMG':
+ ret=re.search(self.re_img, line)
+ if ret:
+ bkinfo[sid]['img'] = ret.group(1)
+ stat = 'BOOKNAME'
+ continue
+ elif stat=='BOOKNAME':
+ ret=re.search(self.re_bn, line)
+ if ret:
+ bkname = re.split(r'[((\s]',ret.group(1).strip())[0]
+ bkinfo[sid]['bookname'] = bkname
+ stat = 'AUTHOR'
+ continue
+ elif stat=='AUTHOR':
+ ret=re.search(self.re_author, line)
+ if ret:
+ author = ','.join(re.split('|<\/span', ret.group(0))[3::4])
+ bkinfo[sid]['author'] = author
+ stat = 'RATE'
+ continue
+ elif stat=='RATE':
+ ret=re.search(self.re_rate, line)
+ if ret:
+ bkinfo[sid]['rate'] = ret.group(1).split(' ')[0]
+ stat = 'AUTHOR'
+ continue
+ else: continue
+
+ if re.search(self.re_end, line):
+ stat = 'ASIN'  # assignment, not the former no-op `==` comparison: return the state machine to the ASIN branch
+ continue
return bkinfo
def filter_spide_books(self, mbkn, mbkinfo):
""" mbkn - bookname to be spide
- mbkinfo: {
+ mbkinfo:
+ douban
+ {
"25853071": { # sid
- "bookname": "庆余年",
- "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
- "rate": "8.0",
- "author": "猫腻"
+ "bookname": "庆余年",
+ "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
+ "rate": "8.0",
+ "author": "猫腻"
},...}
+ amazon
+ "B07RN73425": {
+ "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
+ "bookname": "古典名著普及文库:孟子",
+ "author": "孙钦善",
+ "rate": "3.9"
+ }
+
"""
#booklink - https://book.douban.com/subject/{sid}
+ # f1/d1: mbkn include in bookname
+ # f2/d2: bookname include mbkn
+ # f3/d3: mbkn and bookname different
[f1,f2,f3] = [0,0,0]
[d1,d2,d3] = [{},{},{}]
for k,v in mbkinfo.items():
diff --git a/tparseamazon.py b/tparseamazon.py
index 1760308..dac37f5 100644
--- a/tparseamazon.py
+++ b/tparseamazon.py
@@ -1,3 +1,10 @@
+#########################################################
+## @file : tparseamazon.py
+## @desc : test parsing of the amazon response
+## @create : 2020/05/26
+## @author : Chengan
+## @email : douboer@gmail.com
+#########################################################
import re
import json
@@ -11,9 +18,33 @@ s =['''
堀田江理(Eri Hotta)
''',
'''
景跃进, 张小劲、 余逊达
''']
+tre = ['''
''',
+'''野田洋次郎、 蒋青青
''',
+'''''',
+'''
''']
+
+
for t in s:
ret = re.split('|<\/span',t)
fret = ret[3::4]
#print(json.dumps(re.split('|<\/span',t), indent=2, ensure_ascii=False))
print(','.join(fret))
+re_asin = re.compile(r'''^