diff --git a/.DS_Store b/.DS_Store index fb6f918..9d4c9d5 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/changelog.md b/changelog.md index 48cd995..abc149a 100644 --- a/changelog.md +++ b/changelog.md @@ -91,9 +91,9 @@ b['1']['2'] = {'3':1} # OK - import function: - local **done** - kindle **done** - - duokan - - amazon -- link to douban + - duokan **doing** + - amazon **doing** +- link to douban and amazon **doing** - export function: - to [evernote](https://github.com/benhorvath/kindle2evernote/blob/master/kindle2evernote.py) - to web html format @@ -115,3 +115,5 @@ b['1']['2'] = {'3':1} # OK - chrome extension - clean (sdr) - sync modify to kindle device +- compare parse html info with xpath & beautisoap & regex + diff --git a/downimg/71GBLkJWDTL._AC_UY218_.jpg b/downimg/71GBLkJWDTL._AC_UY218_.jpg new file mode 100644 index 0000000..b2aa876 Binary files /dev/null and b/downimg/71GBLkJWDTL._AC_UY218_.jpg differ diff --git a/downimg/71Zs1nSJrgL._AC_UY218_.jpg b/downimg/71Zs1nSJrgL._AC_UY218_.jpg new file mode 100644 index 0000000..7add045 Binary files /dev/null and b/downimg/71Zs1nSJrgL._AC_UY218_.jpg differ diff --git a/downimg/p2575362797.jpg b/downimg/p2575362797.jpg new file mode 100644 index 0000000..8d7e02b Binary files /dev/null and b/downimg/p2575362797.jpg differ diff --git a/downimg/s1026024.jpg b/downimg/s1026024.jpg new file mode 100644 index 0000000..28d4a93 Binary files /dev/null and b/downimg/s1026024.jpg differ diff --git a/downimg/s1670329.jpg b/downimg/s1670329.jpg new file mode 100644 index 0000000..0374fb8 Binary files /dev/null and b/downimg/s1670329.jpg differ diff --git a/downimg/s1738643.jpg b/downimg/s1738643.jpg new file mode 100644 index 0000000..9f86c70 Binary files /dev/null and b/downimg/s1738643.jpg differ diff --git a/downimg/s24399718.jpg b/downimg/s24399718.jpg new file mode 100644 index 0000000..e119617 Binary files /dev/null and b/downimg/s24399718.jpg differ diff --git a/downimg/s26303695.jpg b/downimg/s26303695.jpg new file mode 100644 index 0000000..28e05c4 Binary files /dev/null and b/downimg/s26303695.jpg differ diff --git a/downimg/s2689149.jpg b/downimg/s2689149.jpg new file mode 100644 index 0000000..575fdf1 Binary files /dev/null and b/downimg/s2689149.jpg differ diff --git a/downimg/s27246465.jpg b/downimg/s27246465.jpg new file mode 100644 index 0000000..c3da780 Binary files /dev/null and b/downimg/s27246465.jpg differ diff --git a/downimg/s27276912.jpg b/downimg/s27276912.jpg new file mode 100644 index 0000000..2a9fcb8 Binary files /dev/null and b/downimg/s27276912.jpg differ diff --git a/downimg/s27653114.jpg b/downimg/s27653114.jpg new file mode 100644 index 0000000..2a4e643 Binary files /dev/null and b/downimg/s27653114.jpg differ diff --git a/downimg/s28283792.jpg b/downimg/s28283792.jpg new file mode 100644 index 0000000..25b5522 Binary files /dev/null and b/downimg/s28283792.jpg differ diff --git a/downimg/s29357535.jpg b/downimg/s29357535.jpg new file mode 100644 index 0000000..2d75a37 Binary files /dev/null and b/downimg/s29357535.jpg differ diff --git a/downimg/s29399593.jpg b/downimg/s29399593.jpg new file mode 100644 index 0000000..ff64a0d Binary files /dev/null and b/downimg/s29399593.jpg differ diff --git a/downimg/s29581756.jpg b/downimg/s29581756.jpg new file mode 100644 index 0000000..8bee945 Binary files /dev/null and b/downimg/s29581756.jpg differ diff --git a/downimg/s29811329.jpg b/downimg/s29811329.jpg new file mode 100644 index 0000000..f13cd4d Binary files /dev/null and b/downimg/s29811329.jpg differ diff --git a/downimg/s29841565.jpg b/downimg/s29841565.jpg new file mode 100644 index 0000000..12f4693 Binary files /dev/null and b/downimg/s29841565.jpg differ diff --git a/downimg/s29879195.jpg b/downimg/s29879195.jpg new file mode 100644 index 0000000..b0160aa Binary files /dev/null and b/downimg/s29879195.jpg differ diff --git a/downimg/s33314966.jpg b/downimg/s33314966.jpg new file mode 100644 index 0000000..edad265 Binary files /dev/null and b/downimg/s33314966.jpg differ diff --git a/downimg/s33551591.jpg b/downimg/s33551591.jpg new file mode 100644 index 0000000..efb93df Binary files /dev/null and b/downimg/s33551591.jpg differ diff --git a/downimg/s33559469.jpg b/downimg/s33559469.jpg new file mode 100644 index 0000000..f4e5ce1 Binary files /dev/null and b/downimg/s33559469.jpg differ diff --git a/downimg/s33587329.jpg b/downimg/s33587329.jpg new file mode 100644 index 0000000..4cc8453 Binary files /dev/null and b/downimg/s33587329.jpg differ diff --git a/downimg/s3948396.jpg b/downimg/s3948396.jpg new file mode 100644 index 0000000..449f2c0 Binary files /dev/null and b/downimg/s3948396.jpg differ diff --git a/downimg/s4052388.jpg b/downimg/s4052388.jpg new file mode 100644 index 0000000..3a660b5 Binary files /dev/null and b/downimg/s4052388.jpg differ diff --git a/downimg/s4386858.jpg b/downimg/s4386858.jpg new file mode 100644 index 0000000..5cbff45 Binary files /dev/null and b/downimg/s4386858.jpg differ diff --git a/downimg/s4397638.jpg b/downimg/s4397638.jpg new file mode 100644 index 0000000..9121d11 Binary files /dev/null and b/downimg/s4397638.jpg differ diff --git a/downimg/s5641654.jpg b/downimg/s5641654.jpg new file mode 100644 index 0000000..d449457 Binary files /dev/null and b/downimg/s5641654.jpg differ diff --git a/downimg/s6979148.jpg b/downimg/s6979148.jpg new file mode 100644 index 0000000..18cce11 Binary files /dev/null and b/downimg/s6979148.jpg differ diff --git a/downimg/s7046197.jpg b/downimg/s7046197.jpg new file mode 100644 index 0000000..48035d4 Binary files /dev/null and b/downimg/s7046197.jpg differ diff --git a/downimg/s8488837.jpg b/downimg/s8488837.jpg new file mode 100644 index 0000000..b97bf63 Binary files /dev/null and b/downimg/s8488837.jpg differ diff --git a/kmanapp.py b/kmanapp.py index d0332c3..70afb9b 100644 --- a/kmanapp.py +++ b/kmanapp.py @@ -11,6 +11,7 @@ import sys import os from time import sleep import pandas as pd +import threading from PySide2.QtWidgets import * @@ -23,6 +24,7 @@ from PySide2.QtGui import (QBrush, QColor, QConicalGradient, QCursor, QFont, from mainwindow import Ui_MainWindow from kman import * +from parseweb import * # import binary resource file(kmanapp_rc.py) import kmanapp_rc @@ -63,6 +65,8 @@ class kmanWindow(QMainWindow): self.tree_selected = 'note_root' self.km = kMan() + self.spide = bookInfoSpide() + # initial check order: # 1. backup file bk.data -> # 2. kindle(My Clippings.txt) -> @@ -520,6 +524,16 @@ class kmanWindow(QMainWindow): # stop check thread self.flag = False + def grab_all_book_info(self): + + for bkname in self.books_data.keys(): + bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0] + print(bkname) + bkinfo = self.spide.grab_book_info(bkname) + filter_bkinfo = self.spide.filter_spide_book(bkinfo) + if filter_bkinfo: + self.spide.down_book_img(filter_bkinfo) + # thanks Martin Fitzpatrick ^_^ # https://www.learnpyqt.com/courses/model-views/qtableview-modelviews-numpy-pandas/ class nTableModel(QAbstractTableModel): @@ -562,6 +576,10 @@ if __name__ == "__main__": #kmw.showFullScreen() kmw.show() + trd = threading.Thread(target=kmw.grab_all_book_info) + trd.setDaemon(True) + trd.start() + # loop check kindle is connected or not # BUG to be implement XXXX """ diff --git a/parseweb.py b/parseweb.py index 6c7b8f3..3e91cdc 100644 --- a/parseweb.py +++ b/parseweb.py @@ -22,17 +22,21 @@ logger = logging.getLogger() logger.addHandler(logging.FileHandler('log')) logger.setLevel(logging.DEBUG) -spidetp = 1 # 0 - douban 1- amazon +ISDOUBAN = 1 +IMGPATH = './downimg' +LINKPREF = 'https://book.douban.com/subject/' \ + if ISDOUBAN else 'https://www.amazon.cn/s?k=' mheaders = { 'Host': 'www.douban.com', 'Referer': 'http://www.douban.com', - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36" + 'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' } +#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36" mparams = {} murl = "" -if spidetp==0: +if ISDOUBAN==1: mparams['Host']='www.douban.com', mparams['search_text'] = 'bkname_xxx' mparams['cat']='1001' @@ -141,7 +145,7 @@ class bookInfoSpide(): [re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None] - if spidetp==0: + if ISDOUBAN==1: re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S) re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''') re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''') @@ -164,6 +168,7 @@ class bookInfoSpide(): """mbkn - bookname to be spided return: { "25853071": { # sid + "link":"https://....xxxxx" "bookname": "庆余年", "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg", "rate": "8.0", @@ -171,15 +176,32 @@ class bookInfoSpide(): },...} """ - if spidetp==0: #douban + if ISDOUBAN==1: #douban mparams['search_text'] = mbkn else: #amazon mparams['k'] = mbkn - r = requests.get( url=murl, headers=mheaders, params=mparams) + try: + s = requests.Session() + s.header = mheaders + s.params = mparams + r = s.get(murl) + #r = requests.get( url=murl, headers=mheaders, params=mparams) + + except requests.exceptions.ConnectionError: + print('ConnectionError -- please wait 3 seconds') + time.sleep(3) + + except requests.exceptions.ChunkedEncodingError: + print('ChunkedEncodingError -- please wait 3 seconds') + time.sleep(3) + + except: + print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds') + time.sleep(3) if r.status_code != 200: - raise Exception("请求失败") + print('grab book {} info from webside failure'.format(mbkn)) bkinfo = defaultdict(dict) sid = None @@ -187,7 +209,7 @@ class bookInfoSpide(): resp = r.text - if spidetp==0: + if ISDOUBAN==1: stat = 'SID' for line in resp.split('\n'): line = line.strip() @@ -197,6 +219,7 @@ class bookInfoSpide(): ret=re.search(self.re_bn, line) if ret: sid = ret.group(1) + bkinfo[sid]['link'] = os.path.join(LINKPREF,sid) bkinfo[sid]['bookname'] = ret.group(2) bkinfo[sid]['img'] = ret.group(3) stat = 'RATE' @@ -226,7 +249,8 @@ class bookInfoSpide(): if stat=='ASIN': ret=re.search(self.re_asin, line) if ret: - sid = ret.group(1) + sid=ret.group(1) + bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1)) stat = 'IMG' continue elif stat=='IMG': @@ -261,25 +285,27 @@ class bookInfoSpide(): stat=='ASIN' continue - return bkinfo + return [mbkn, bkinfo] - def filter_spide_books(self, mbkn, mbkinfo): - """ mbkn - bookname to be spide + def filter_spide_book(self, mbkinfo): + """ mbkinfo: douban { - "25853071": { # sid - "bookname": "庆余年", + "庆余年": { + "link":"https://....25853071", + "bookname": "庆余年xxx", "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg", "rate": "8.0", "author": "猫腻" },...} amazon - "B07RN73425": { - "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg", + "孟子": { + "link": "https://....B07RN73425", "bookname": "古典名著普及文库:孟子", - "author": "孙钦善", + "img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg", "rate": "3.9" + "author": "孙钦善", } """ @@ -289,18 +315,19 @@ class bookInfoSpide(): # f3/d3: mbkn and bookname different [f1,f2,f3] = [0,0,0] [d1,d2,d3] = [{},{},{}] - for k,v in mbkinfo.items(): + mbkn = mbkinfo[0] + for k,v in mbkinfo[1].items(): bkn = v['bookname'] - if len(v) == 4: + if len(v) == 5: if (not f1) and (mbkn in bkn): f1 = 1 - d1 = {k:v} + d1 = {mbkn:v} elif (not f1) and (not f2) and (bkn in mbkn): f2 = 1 - d2 = {k:v} + d2 = {mbkn:v} elif (not f3): f3 = 1 - d3 = {k:v} + d3 = {mbkn:v} else: continue else: continue @@ -312,7 +339,28 @@ class bookInfoSpide(): elif f3: return d3 - return 0 + return None + + def down_book_img(self, mbkinfo): + import os + import socket + from urllib.request import urlretrieve + + headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} + + for k,v in mbkinfo.items(): + link = v['img'] + if not os.path.exists(IMGPATH): os.mkdir(IMGPATH) + p=os.path.join(IMGPATH,link.split('/')[-1]) + + try: + img = requests.get(link, headers=headers) + if img.status_code == 200: + with open(p, 'wb') as fp: + fp.write(img.content) + except Exception as e: + print(e) + if __name__ == '__main__': @@ -322,9 +370,10 @@ if __name__ == '__main__': bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0] print(bkname) bkinfo = spide.grab_book_info(bkname) - filter_bkinfo = spide.filter_spide_books(bkname, bkinfo) + filter_bkinfo = spide.filter_spide_book(bkinfo) + if filter_bkinfo: spide.down_book_img(filter_bkinfo) logger.debug('================ {} ================'.format(bkname)) - #logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False)) + logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False)) logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False)) diff --git a/ttranslator.py b/ttranslator.py index 90b6176..13d9944 100644 --- a/ttranslator.py +++ b/ttranslator.py @@ -1,4 +1,6 @@ +# from webside not test, only for reference XXX + #coding=utf8 import random import requests