kindle manager
@@ -91,9 +91,9 @@ b['1']['2'] = {'3':1} # OK
|
|||||||
- import function:
|
- import function:
|
||||||
- local **done**
|
- local **done**
|
||||||
- kindle **done**
|
- kindle **done**
|
||||||
- duokan
|
- duokan **doing**
|
||||||
- amazon
|
- amazon **doing**
|
||||||
- link to douban
|
- link to douban and amazon **doing**
|
||||||
- export function:
|
- export function:
|
||||||
- to [evernote](https://github.com/benhorvath/kindle2evernote/blob/master/kindle2evernote.py)
|
- to [evernote](https://github.com/benhorvath/kindle2evernote/blob/master/kindle2evernote.py)
|
||||||
- to web html format
|
- to web html format
|
||||||
@@ -115,3 +115,5 @@ b['1']['2'] = {'3':1} # OK
|
|||||||
- chrome extension
|
- chrome extension
|
||||||
- clean (sdr)
|
- clean (sdr)
|
||||||
- sync modify to kindle device
|
- sync modify to kindle device
|
||||||
|
- compare parsing HTML info with XPath, BeautifulSoup and regex
|
||||||
|
|
||||||
|
|||||||
BIN
downimg/71GBLkJWDTL._AC_UY218_.jpg
Normal file
|
After Width: | Height: | Size: 5.8 KiB |
BIN
downimg/71Zs1nSJrgL._AC_UY218_.jpg
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
downimg/p2575362797.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s1026024.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s1670329.jpg
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
downimg/s1738643.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s24399718.jpg
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
downimg/s26303695.jpg
Normal file
|
After Width: | Height: | Size: 13 KiB |
BIN
downimg/s2689149.jpg
Normal file
|
After Width: | Height: | Size: 22 KiB |
BIN
downimg/s27246465.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s27276912.jpg
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
downimg/s27653114.jpg
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
downimg/s28283792.jpg
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
downimg/s29357535.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s29399593.jpg
Normal file
|
After Width: | Height: | Size: 8.1 KiB |
BIN
downimg/s29581756.jpg
Normal file
|
After Width: | Height: | Size: 8.4 KiB |
BIN
downimg/s29811329.jpg
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
downimg/s29841565.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s29879195.jpg
Normal file
|
After Width: | Height: | Size: 31 KiB |
BIN
downimg/s33314966.jpg
Normal file
|
After Width: | Height: | Size: 27 KiB |
BIN
downimg/s33551591.jpg
Normal file
|
After Width: | Height: | Size: 40 KiB |
BIN
downimg/s33559469.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s33587329.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
downimg/s3948396.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s4052388.jpg
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
downimg/s4386858.jpg
Normal file
|
After Width: | Height: | Size: 32 KiB |
BIN
downimg/s4397638.jpg
Normal file
|
After Width: | Height: | Size: 33 KiB |
BIN
downimg/s5641654.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s6979148.jpg
Normal file
|
After Width: | Height: | Size: 35 KiB |
BIN
downimg/s7046197.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
downimg/s8488837.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
18
kmanapp.py
@@ -11,6 +11,7 @@ import sys
|
|||||||
import os
|
import os
|
||||||
from time import sleep
|
from time import sleep
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import threading
|
||||||
|
|
||||||
from PySide2.QtWidgets import *
|
from PySide2.QtWidgets import *
|
||||||
|
|
||||||
@@ -23,6 +24,7 @@ from PySide2.QtGui import (QBrush, QColor, QConicalGradient, QCursor, QFont,
|
|||||||
|
|
||||||
from mainwindow import Ui_MainWindow
|
from mainwindow import Ui_MainWindow
|
||||||
from kman import *
|
from kman import *
|
||||||
|
from parseweb import *
|
||||||
|
|
||||||
# import binary resource file(kmanapp_rc.py)
|
# import binary resource file(kmanapp_rc.py)
|
||||||
import kmanapp_rc
|
import kmanapp_rc
|
||||||
@@ -63,6 +65,8 @@ class kmanWindow(QMainWindow):
|
|||||||
self.tree_selected = 'note_root'
|
self.tree_selected = 'note_root'
|
||||||
|
|
||||||
self.km = kMan()
|
self.km = kMan()
|
||||||
|
self.spide = bookInfoSpide()
|
||||||
|
|
||||||
# initial check order:
|
# initial check order:
|
||||||
# 1. backup file bk.data ->
|
# 1. backup file bk.data ->
|
||||||
# 2. kindle(My Clippings.txt) ->
|
# 2. kindle(My Clippings.txt) ->
|
||||||
@@ -520,6 +524,16 @@ class kmanWindow(QMainWindow):
|
|||||||
# stop check thread
|
# stop check thread
|
||||||
self.flag = False
|
self.flag = False
|
||||||
|
|
||||||
|
def grab_all_book_info(self):
    """Look up web info and download a cover for every book in ``self.books_data``.

    Each key of ``self.books_data`` is shortened to the text before its
    first bracket, dash, colon, underscore or whitespace character. The
    shortened title is handed to ``self.spide.grab_book_info``, the result
    is narrowed by ``self.spide.filter_spide_book`` and, when something is
    left, passed to ``self.spide.down_book_img``.

    The script entry point runs this method on a background thread, so a
    failure for one book is reported and skipped instead of ending the
    whole pass.

    Returns:
        None.
    """
    for full_name in self.books_data:
        # Keep only the leading part of the title (drops subtitles,
        # series markers, edition notes, ...).
        short_name = re.split(r'[\((\-\::_\s]', full_name.strip())[0]
        if not short_name:
            # The title starts with a delimiter (e.g. "(x) y"), so there is
            # nothing meaningful to search for.
            continue
        print(short_name)
        try:
            bkinfo = self.spide.grab_book_info(short_name)
            filter_bkinfo = self.spide.filter_spide_book(bkinfo)
            if filter_bkinfo:
                self.spide.down_book_img(filter_bkinfo)
        except Exception as err:
            # Best effort: one unreachable or unparsable book must not stop
            # the remaining books from being processed.
            print('grab book {} info failure: {}'.format(short_name, err))
|
||||||
|
|
||||||
# thanks Martin Fitzpatrick ^_^
|
# thanks Martin Fitzpatrick ^_^
|
||||||
# https://www.learnpyqt.com/courses/model-views/qtableview-modelviews-numpy-pandas/
|
# https://www.learnpyqt.com/courses/model-views/qtableview-modelviews-numpy-pandas/
|
||||||
class nTableModel(QAbstractTableModel):
|
class nTableModel(QAbstractTableModel):
|
||||||
@@ -562,6 +576,10 @@ if __name__ == "__main__":
|
|||||||
#kmw.showFullScreen()
|
#kmw.showFullScreen()
|
||||||
kmw.show()
|
kmw.show()
|
||||||
|
|
||||||
|
trd = threading.Thread(target=kmw.grab_all_book_info)
|
||||||
|
trd.setDaemon(True)
|
||||||
|
trd.start()
|
||||||
|
|
||||||
# loop check kindle is connected or not
|
# loop check kindle is connected or not
|
||||||
# BUG to be implement XXXX
|
# BUG to be implement XXXX
|
||||||
"""
|
"""
|
||||||
|
|||||||
99
parseweb.py
@@ -22,17 +22,21 @@ logger = logging.getLogger()
|
|||||||
logger.addHandler(logging.FileHandler('log'))
|
logger.addHandler(logging.FileHandler('log'))
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
spidetp = 1 # 0 - douban 1- amazon
|
ISDOUBAN = 1
|
||||||
|
IMGPATH = './downimg'
|
||||||
|
LINKPREF = 'https://book.douban.com/subject/' \
|
||||||
|
if ISDOUBAN else 'https://www.amazon.cn/s?k='
|
||||||
|
|
||||||
mheaders = {
|
mheaders = {
|
||||||
'Host': 'www.douban.com',
|
'Host': 'www.douban.com',
|
||||||
'Referer': 'http://www.douban.com',
|
'Referer': 'http://www.douban.com',
|
||||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
|
||||||
}
|
}
|
||||||
|
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
||||||
|
|
||||||
mparams = {}
|
mparams = {}
|
||||||
murl = ""
|
murl = ""
|
||||||
if spidetp==0:
|
if ISDOUBAN==1:
|
||||||
mparams['Host']='www.douban.com',
|
mparams['Host']='www.douban.com',
|
||||||
mparams['search_text'] = 'bkname_xxx'
|
mparams['search_text'] = 'bkname_xxx'
|
||||||
mparams['cat']='1001'
|
mparams['cat']='1001'
|
||||||
@@ -141,7 +145,7 @@ class bookInfoSpide():
|
|||||||
|
|
||||||
|
|
||||||
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
|
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
|
||||||
if spidetp==0:
|
if ISDOUBAN==1:
|
||||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||||
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
||||||
@@ -164,6 +168,7 @@ class bookInfoSpide():
|
|||||||
"""mbkn - bookname to be spided
|
"""mbkn - bookname to be spided
|
||||||
return: {
|
return: {
|
||||||
"25853071": { # sid
|
"25853071": { # sid
|
||||||
|
"link":"https://....xxxxx"
|
||||||
"bookname": "庆余年",
|
"bookname": "庆余年",
|
||||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||||
"rate": "8.0",
|
"rate": "8.0",
|
||||||
@@ -171,15 +176,32 @@ class bookInfoSpide():
|
|||||||
},...}
|
},...}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if spidetp==0: #douban
|
if ISDOUBAN==1: #douban
|
||||||
mparams['search_text'] = mbkn
|
mparams['search_text'] = mbkn
|
||||||
else: #amazon
|
else: #amazon
|
||||||
mparams['k'] = mbkn
|
mparams['k'] = mbkn
|
||||||
|
|
||||||
r = requests.get( url=murl, headers=mheaders, params=mparams)
|
try:
|
||||||
|
s = requests.Session()
|
||||||
|
s.header = mheaders
|
||||||
|
s.params = mparams
|
||||||
|
r = s.get(murl)
|
||||||
|
#r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||||
|
|
||||||
|
except requests.exceptions.ConnectionError:
|
||||||
|
print('ConnectionError -- please wait 3 seconds')
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
except requests.exceptions.ChunkedEncodingError:
|
||||||
|
print('ChunkedEncodingError -- please wait 3 seconds')
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
except:
|
||||||
|
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise Exception("请求失败")
|
print('grab book {} info from webside failure'.format(mbkn))
|
||||||
|
|
||||||
bkinfo = defaultdict(dict)
|
bkinfo = defaultdict(dict)
|
||||||
sid = None
|
sid = None
|
||||||
@@ -187,7 +209,7 @@ class bookInfoSpide():
|
|||||||
|
|
||||||
resp = r.text
|
resp = r.text
|
||||||
|
|
||||||
if spidetp==0:
|
if ISDOUBAN==1:
|
||||||
stat = 'SID'
|
stat = 'SID'
|
||||||
for line in resp.split('\n'):
|
for line in resp.split('\n'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
@@ -197,6 +219,7 @@ class bookInfoSpide():
|
|||||||
ret=re.search(self.re_bn, line)
|
ret=re.search(self.re_bn, line)
|
||||||
if ret:
|
if ret:
|
||||||
sid = ret.group(1)
|
sid = ret.group(1)
|
||||||
|
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
|
||||||
bkinfo[sid]['bookname'] = ret.group(2)
|
bkinfo[sid]['bookname'] = ret.group(2)
|
||||||
bkinfo[sid]['img'] = ret.group(3)
|
bkinfo[sid]['img'] = ret.group(3)
|
||||||
stat = 'RATE'
|
stat = 'RATE'
|
||||||
@@ -226,7 +249,8 @@ class bookInfoSpide():
|
|||||||
if stat=='ASIN':
|
if stat=='ASIN':
|
||||||
ret=re.search(self.re_asin, line)
|
ret=re.search(self.re_asin, line)
|
||||||
if ret:
|
if ret:
|
||||||
sid = ret.group(1)
|
sid=ret.group(1)
|
||||||
|
bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1))
|
||||||
stat = 'IMG'
|
stat = 'IMG'
|
||||||
continue
|
continue
|
||||||
elif stat=='IMG':
|
elif stat=='IMG':
|
||||||
@@ -261,25 +285,27 @@ class bookInfoSpide():
|
|||||||
stat=='ASIN'
|
stat=='ASIN'
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return bkinfo
|
return [mbkn, bkinfo]
|
||||||
|
|
||||||
def filter_spide_books(self, mbkn, mbkinfo):
|
def filter_spide_book(self, mbkinfo):
|
||||||
""" mbkn - bookname to be spide
|
"""
|
||||||
mbkinfo:
|
mbkinfo:
|
||||||
douban
|
douban
|
||||||
{
|
{
|
||||||
"25853071": { # sid
|
"庆余年": {
|
||||||
"bookname": "庆余年",
|
"link":"https://....25853071",
|
||||||
|
"bookname": "庆余年xxx",
|
||||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||||
"rate": "8.0",
|
"rate": "8.0",
|
||||||
"author": "猫腻"
|
"author": "猫腻"
|
||||||
},...}
|
},...}
|
||||||
amazon
|
amazon
|
||||||
"B07RN73425": {
|
"孟子": {
|
||||||
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
"link": "https://....B07RN73425",
|
||||||
"bookname": "古典名著普及文库:孟子",
|
"bookname": "古典名著普及文库:孟子",
|
||||||
"author": "孙钦善",
|
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||||
"rate": "3.9"
|
"rate": "3.9"
|
||||||
|
"author": "孙钦善",
|
||||||
}
|
}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@@ -289,18 +315,19 @@ class bookInfoSpide():
|
|||||||
# f3/d3: mbkn and bookname different
|
# f3/d3: mbkn and bookname different
|
||||||
[f1,f2,f3] = [0,0,0]
|
[f1,f2,f3] = [0,0,0]
|
||||||
[d1,d2,d3] = [{},{},{}]
|
[d1,d2,d3] = [{},{},{}]
|
||||||
for k,v in mbkinfo.items():
|
mbkn = mbkinfo[0]
|
||||||
|
for k,v in mbkinfo[1].items():
|
||||||
bkn = v['bookname']
|
bkn = v['bookname']
|
||||||
if len(v) == 4:
|
if len(v) == 5:
|
||||||
if (not f1) and (mbkn in bkn):
|
if (not f1) and (mbkn in bkn):
|
||||||
f1 = 1
|
f1 = 1
|
||||||
d1 = {k:v}
|
d1 = {mbkn:v}
|
||||||
elif (not f1) and (not f2) and (bkn in mbkn):
|
elif (not f1) and (not f2) and (bkn in mbkn):
|
||||||
f2 = 1
|
f2 = 1
|
||||||
d2 = {k:v}
|
d2 = {mbkn:v}
|
||||||
elif (not f3):
|
elif (not f3):
|
||||||
f3 = 1
|
f3 = 1
|
||||||
d3 = {k:v}
|
d3 = {mbkn:v}
|
||||||
else: continue
|
else: continue
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
@@ -312,7 +339,28 @@ class bookInfoSpide():
|
|||||||
elif f3:
|
elif f3:
|
||||||
return d3
|
return d3
|
||||||
|
|
||||||
return 0
|
return None
|
||||||
|
|
||||||
|
def down_book_img(self, mbkinfo):
    """Download the cover image of every entry in *mbkinfo* into ``IMGPATH``.

    Args:
        mbkinfo: dict whose values are per-book dicts; only the ``'img'``
            entry (the image URL) of each value is used. Entries without
            an image URL are skipped.

    Each image is stored under ``IMGPATH`` using the last path segment of
    its URL as the file name. Responses with a status other than 200 are
    ignored. Download and write errors are printed, not raised, so one
    bad cover does not prevent the others from being saved.

    Returns:
        None.
    """
    import os

    if not mbkinfo:
        return

    # NOTE(review): the value starts with a literal "User-Agent:" prefix,
    # which looks unintended -- confirm before changing it.
    headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    # Create the target directory once; exist_ok avoids the check-then-create race.
    os.makedirs(IMGPATH, exist_ok=True)

    for info in mbkinfo.values():
        link = info.get('img')
        if not link:
            continue
        target = os.path.join(IMGPATH, link.split('/')[-1])

        try:
            # A timeout keeps a stalled host from blocking the caller forever.
            img = requests.get(link, headers=headers, timeout=10)
            if img.status_code == 200:
                with open(target, 'wb') as fp:
                    fp.write(img.content)
        except Exception as e:
            print(e)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
@@ -322,9 +370,10 @@ if __name__ == '__main__':
|
|||||||
bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0]
|
bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0]
|
||||||
print(bkname)
|
print(bkname)
|
||||||
bkinfo = spide.grab_book_info(bkname)
|
bkinfo = spide.grab_book_info(bkname)
|
||||||
filter_bkinfo = spide.filter_spide_books(bkname, bkinfo)
|
filter_bkinfo = spide.filter_spide_book(bkinfo)
|
||||||
|
if filter_bkinfo: spide.down_book_img(filter_bkinfo)
|
||||||
|
|
||||||
logger.debug('================ {} ================'.format(bkname))
|
logger.debug('================ {} ================'.format(bkname))
|
||||||
#logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
||||||
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))
|
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
|
|
||||||
|
# copied from a website, not tested; for reference only XXX
|
||||||
|
|
||||||
#coding=utf8
|
#coding=utf8
|
||||||
import random
|
import random
|
||||||
import requests
|
import requests
|
||||||
|
|||||||