#########################################################
## @file : tparseamazon.py
## @desc : test parse amazon response
## @create : 2020/05/26
## @author : Chengan
## @email : douboer@gmail.com
#########################################################

import re
import json
# Sample author rows used to exercise the author-splitting logic.
# Each element is a single `<div class="a-row ...">` whose `<span dir="auto">`
# children hold, in order: an empty placeholder span, then contributor names
# alternating with separator spans (", " or "、 "); the first sample also ends
# with a trailing "等等。" span.  Some rows appear twice on purpose-or-not in the
# captured data and are kept exactly as captured.
s = [
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">丹·琼斯(Dan Jones)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">杰弗里·瓦夫罗(Geoffrey Wawro)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">克里斯托弗·希伯特(Christopher Hibbert)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">罗斯·金(Ross King)</span><span class="a-size-base" dir="auto">等等。</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">马克·哈里斯</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">黎绮妮</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[美]威廉·厄本(William Urban)</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">陆大鹏</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">刘晓晖</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">[英]安德鲁·罗伯茨(Andrew Roberts)</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">苏然</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">堀田江理(Eri Hotta)</span></div>''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">景跃进</span><span class="a-size-base" dir="auto">, </span><span class="a-size-base" dir="auto">张小劲</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">余逊达</span></div>''',
]

# One search result item, reduced to the six lines the `re_*` patterns are
# checked against, in this order:
#   0: opening result `<div>` carrying the `data-asin` attribute
#   1: cover `<img src="...">` line
#   2: `alt="..."` line (presumably the book title -- confirm against the page)
#   3: author row (same shape as the rows in `s`)
#   4: rating `<span aria-label="...">` line
#   5: markup used as the end-of-item marker
tre = [
    '''<div data-asin="B07S46LMCK" data-index="3" data-uuid="ef76dd52-2ab2-4372-8597-2c3258698b6e" data-component-type="s-search-result" class="sg-col-20-of-24 s-result-item s-asin sg-col-0-of-12 sg-col-28-of-32 sg-col-16-of-20 sg-col sg-col-32-of-36 sg-col-12-of-16 sg-col-24-of-28"><div class="sg-col-inner">''',
    '''<img src="https://images-cn.ssl-images-amazon.com/images/I/41JFUwuJPCL._AC_UY218_.jpg"''',
    '''alt="小窗幽记(国学大书院)(为人处世的智慧之果 修身齐家的行动指南)"''',
    '''<div class="a-row a-size-base a-color-secondary"><span class="a-size-base" dir="auto"></span><span class="a-size-base" dir="auto">野田洋次郎</span><span class="a-size-base" dir="auto">、 </span><span class="a-size-base" dir="auto">蒋青青</span></div>''',
    '''<span aria-label="4.7 颗星,最多 5 颗星">''',
    '''<span class="a-letter-space"></span></div></div>''',
]


# Splits an author row on every opening `<span ... auto">` tag and on every
# `</span`.  Written as a raw string so the regex reaches `re` untouched by
# string-escape processing, and compiled once instead of once per row.
re_author_split = re.compile(r'<span.+?auto">|</span')

# Print the contributor names of each sample row as a comma-separated line.
for t in s:
    ret = re_author_split.split(t)
    # Layout of `ret`: [0] is the leading <div ...>, [1] the empty placeholder
    # span, [2] the ">" left over from its closing tag; after that the span
    # texts sit at every second index, names at 3, 7, 11, ... and the
    # separators between them at 5, 9, ...  Stepping by 4 from index 3
    # therefore keeps the names and drops the separators.
    fret = ret[3::4]
    print(','.join(fret))

# Line-oriented patterns for the fields of one search result item.  Every
# pattern is anchored with `^` and is meant to be searched against a single
# line of markup.  Quotes and slashes need no escaping inside a raw-string
# regex, so they are written literally.
re_asin = re.compile(r'^<div data-asin="(.+?)" data-index')  # group 1: data-asin value
re_img = re.compile(r'^<img src="(.+?)"$')                   # group 1: image URL
re_bn = re.compile(r'^alt="(.+?)"$')                         # group 1: alt text (presumably the book name)
re_author = re.compile(r'^<div class=.+auto"></span>.+$')    # no group: matches the whole author row
re_rate = re.compile(r'^<span aria-label="(.+?)">$')         # group 1: aria-label text
# Alternative end marker, currently disabled: r'</body></html>'
re_end = re.compile(r'^<span class="a-letter-space"></span></div></div>')  # no group: end-of-item marker

# Run each pattern against the `tre` line it was written for and print the
# result.  The third field is the group to print: 1 for patterns that capture
# a value, 0 (the whole match) for the author row and the end marker, which
# have no capture group.  A pattern that fails to match raises AttributeError
# on `.group`, which is the failure signal of this test script.
checks = (
    (re_asin, 0, 1),
    (re_img, 1, 1),
    (re_bn, 2, 1),
    (re_author, 3, 0),
    (re_rate, 4, 1),
    (re_end, 5, 0),
)
for pattern, index, group in checks:
    print(pattern.search(tre[index]).group(group))