kindle manager
@@ -91,9 +91,9 @@ b['1']['2'] = {'3':1} # OK
|
||||
- import function:
|
||||
- local **done**
|
||||
- kindle **done**
|
||||
- duokan
|
||||
- amazon
|
||||
- link to douban
|
||||
- duokan **doing**
|
||||
- amazon **doing**
|
||||
- link to douban and amazon **doing**
|
||||
- export function:
|
||||
- to [evernote](https://github.com/benhorvath/kindle2evernote/blob/master/kindle2evernote.py)
|
||||
- to web html format
|
||||
@@ -115,3 +115,5 @@ b['1']['2'] = {'3':1} # OK
|
||||
- chrome extension
|
||||
- clean (sdr)
|
||||
- sync modify to kindle device
|
||||
- compare parsing HTML info with XPath, BeautifulSoup and regex
|
||||
|
||||
|
||||
BIN
downimg/71GBLkJWDTL._AC_UY218_.jpg
Normal file
|
After Width: | Height: | Size: 5.8 KiB |
BIN
downimg/71Zs1nSJrgL._AC_UY218_.jpg
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
downimg/p2575362797.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s1026024.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s1670329.jpg
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
downimg/s1738643.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s24399718.jpg
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
downimg/s26303695.jpg
Normal file
|
After Width: | Height: | Size: 13 KiB |
BIN
downimg/s2689149.jpg
Normal file
|
After Width: | Height: | Size: 22 KiB |
BIN
downimg/s27246465.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s27276912.jpg
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
downimg/s27653114.jpg
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
downimg/s28283792.jpg
Normal file
|
After Width: | Height: | Size: 9.7 KiB |
BIN
downimg/s29357535.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s29399593.jpg
Normal file
|
After Width: | Height: | Size: 8.1 KiB |
BIN
downimg/s29581756.jpg
Normal file
|
After Width: | Height: | Size: 8.4 KiB |
BIN
downimg/s29811329.jpg
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
downimg/s29841565.jpg
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
downimg/s29879195.jpg
Normal file
|
After Width: | Height: | Size: 31 KiB |
BIN
downimg/s33314966.jpg
Normal file
|
After Width: | Height: | Size: 27 KiB |
BIN
downimg/s33551591.jpg
Normal file
|
After Width: | Height: | Size: 40 KiB |
BIN
downimg/s33559469.jpg
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
downimg/s33587329.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
downimg/s3948396.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s4052388.jpg
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
downimg/s4386858.jpg
Normal file
|
After Width: | Height: | Size: 32 KiB |
BIN
downimg/s4397638.jpg
Normal file
|
After Width: | Height: | Size: 33 KiB |
BIN
downimg/s5641654.jpg
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
downimg/s6979148.jpg
Normal file
|
After Width: | Height: | Size: 35 KiB |
BIN
downimg/s7046197.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
downimg/s8488837.jpg
Normal file
|
After Width: | Height: | Size: 15 KiB |
18
kmanapp.py
@@ -11,6 +11,7 @@ import sys
|
||||
import os
|
||||
from time import sleep
|
||||
import pandas as pd
|
||||
import threading
|
||||
|
||||
from PySide2.QtWidgets import *
|
||||
|
||||
@@ -23,6 +24,7 @@ from PySide2.QtGui import (QBrush, QColor, QConicalGradient, QCursor, QFont,
|
||||
|
||||
from mainwindow import Ui_MainWindow
|
||||
from kman import *
|
||||
from parseweb import *
|
||||
|
||||
# import binary resource file(kmanapp_rc.py)
|
||||
import kmanapp_rc
|
||||
@@ -63,6 +65,8 @@ class kmanWindow(QMainWindow):
|
||||
self.tree_selected = 'note_root'
|
||||
|
||||
self.km = kMan()
|
||||
self.spide = bookInfoSpide()
|
||||
|
||||
# initial check order:
|
||||
# 1. backup file bk.data ->
|
||||
# 2. kindle(My Clippings.txt) ->
|
||||
@@ -520,6 +524,16 @@ class kmanWindow(QMainWindow):
|
||||
# stop check thread
|
||||
self.flag = False
|
||||
|
||||
def grab_all_book_info(self):
    """Fetch web info and cover images for every book in ``self.books_data``.

    For each key of ``self.books_data`` the title is cut at the first
    bracket, dash, colon, underscore or whitespace character.  The shortened
    title is passed to ``self.spide.grab_book_info``, the answer is narrowed
    with ``self.spide.filter_spide_book`` and, when anything is left, handed
    to ``self.spide.down_book_img``.

    Titles that shorten to an empty string are skipped, and titles that
    shorten to the same search key are looked up only once.  A failure while
    processing one book is printed and does not stop the remaining books.
    """
    # Iterate over a snapshot of the keys: this method is started on a
    # worker thread, so the dict may be modified elsewhere while the loop
    # runs (NOTE(review): confirm whether the GUI mutates books_data).
    seen = set()
    for raw_name in list(self.books_data):
        bkname = re.split(r'[\((\-\::_\s]', raw_name.strip())[0]

        # An empty key would search the site for nothing; a repeated key
        # would only repeat the same request and download.
        if not bkname or bkname in seen:
            continue
        seen.add(bkname)

        print(bkname)
        try:
            bkinfo = self.spide.grab_book_info(bkname)
            filter_bkinfo = self.spide.filter_spide_book(bkinfo)
            if filter_bkinfo:
                self.spide.down_book_img(filter_bkinfo)
        except Exception as err:
            # Thread boundary: report and carry on with the next book
            # instead of letting one bad lookup end the whole run.
            print('grab book {} info failed: {}'.format(bkname, err))
|
||||
|
||||
# thanks Martin Fitzpatrick ^_^
|
||||
# https://www.learnpyqt.com/courses/model-views/qtableview-modelviews-numpy-pandas/
|
||||
class nTableModel(QAbstractTableModel):
|
||||
@@ -562,6 +576,10 @@ if __name__ == "__main__":
|
||||
#kmw.showFullScreen()
|
||||
kmw.show()
|
||||
|
||||
trd = threading.Thread(target=kmw.grab_all_book_info)
|
||||
trd.setDaemon(True)
|
||||
trd.start()
|
||||
|
||||
# loop check kindle is connected or not
|
||||
# BUG: to be implemented XXXX
|
||||
"""
|
||||
|
||||
99
parseweb.py
@@ -22,17 +22,21 @@ logger = logging.getLogger()
|
||||
logger.addHandler(logging.FileHandler('log'))
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
spidetp = 1 # 0 - douban 1- amazon
|
||||
ISDOUBAN = 1
|
||||
IMGPATH = './downimg'
|
||||
LINKPREF = 'https://book.douban.com/subject/' \
|
||||
if ISDOUBAN else 'https://www.amazon.cn/s?k='
|
||||
|
||||
mheaders = {
|
||||
'Host': 'www.douban.com',
|
||||
'Referer': 'http://www.douban.com',
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
||||
'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
|
||||
}
|
||||
#"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
|
||||
|
||||
mparams = {}
|
||||
murl = ""
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
mparams['Host']='www.douban.com',
|
||||
mparams['search_text'] = 'bkname_xxx'
|
||||
mparams['cat']='1001'
|
||||
@@ -141,7 +145,7 @@ class bookInfoSpide():
|
||||
|
||||
|
||||
[re_bn,re_bn,re_rate,re_norate,re_author,re_end] = [None,None,None,None,None,None]
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\".+?rating_nums\">(.+?)<''', re.S)
|
||||
re_bn = re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
|
||||
re_rate = re.compile(r'''class=\"rating_nums\">(.+?)<''')
|
||||
@@ -164,6 +168,7 @@ class bookInfoSpide():
|
||||
"""mbkn - bookname to be spided
|
||||
return: {
|
||||
"25853071": { # sid
|
||||
"link":"https://....xxxxx"
|
||||
"bookname": "庆余年",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
@@ -171,15 +176,32 @@ class bookInfoSpide():
|
||||
},...}
|
||||
"""
|
||||
|
||||
if spidetp==0: #douban
|
||||
if ISDOUBAN==1: #douban
|
||||
mparams['search_text'] = mbkn
|
||||
else: #amazon
|
||||
mparams['k'] = mbkn
|
||||
|
||||
r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||
try:
|
||||
s = requests.Session()
|
||||
s.header = mheaders
|
||||
s.params = mparams
|
||||
r = s.get(murl)
|
||||
#r = requests.get( url=murl, headers=mheaders, params=mparams)
|
||||
|
||||
except requests.exceptions.ConnectionError:
|
||||
print('ConnectionError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
except requests.exceptions.ChunkedEncodingError:
|
||||
print('ChunkedEncodingError -- please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
except:
|
||||
print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
|
||||
time.sleep(3)
|
||||
|
||||
if r.status_code != 200:
|
||||
raise Exception("请求失败")
|
||||
print('grab book {} info from webside failure'.format(mbkn))
|
||||
|
||||
bkinfo = defaultdict(dict)
|
||||
sid = None
|
||||
@@ -187,7 +209,7 @@ class bookInfoSpide():
|
||||
|
||||
resp = r.text
|
||||
|
||||
if spidetp==0:
|
||||
if ISDOUBAN==1:
|
||||
stat = 'SID'
|
||||
for line in resp.split('\n'):
|
||||
line = line.strip()
|
||||
@@ -197,6 +219,7 @@ class bookInfoSpide():
|
||||
ret=re.search(self.re_bn, line)
|
||||
if ret:
|
||||
sid = ret.group(1)
|
||||
bkinfo[sid]['link'] = os.path.join(LINKPREF,sid)
|
||||
bkinfo[sid]['bookname'] = ret.group(2)
|
||||
bkinfo[sid]['img'] = ret.group(3)
|
||||
stat = 'RATE'
|
||||
@@ -226,7 +249,8 @@ class bookInfoSpide():
|
||||
if stat=='ASIN':
|
||||
ret=re.search(self.re_asin, line)
|
||||
if ret:
|
||||
sid = ret.group(1)
|
||||
sid=ret.group(1)
|
||||
bkinfo[sid]['link'] = os.path.join(LINKPREF,ret.group(1))
|
||||
stat = 'IMG'
|
||||
continue
|
||||
elif stat=='IMG':
|
||||
@@ -261,25 +285,27 @@ class bookInfoSpide():
|
||||
stat=='ASIN'
|
||||
continue
|
||||
|
||||
return bkinfo
|
||||
return [mbkn, bkinfo]
|
||||
|
||||
def filter_spide_books(self, mbkn, mbkinfo):
|
||||
""" mbkn - bookname to be spide
|
||||
def filter_spide_book(self, mbkinfo):
|
||||
"""
|
||||
mbkinfo:
|
||||
douban
|
||||
{
|
||||
"25853071": { # sid
|
||||
"bookname": "庆余年",
|
||||
"庆余年": {
|
||||
"link":"https://....25853071",
|
||||
"bookname": "庆余年xxx",
|
||||
"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
|
||||
"rate": "8.0",
|
||||
"author": "猫腻"
|
||||
},...}
|
||||
amazon
|
||||
"B07RN73425": {
|
||||
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||
"孟子": {
|
||||
"link": "https://....B07RN73425",
|
||||
"bookname": "古典名著普及文库:孟子",
|
||||
"author": "孙钦善",
|
||||
"img": "https://images-cn.ssl-images-amazon.com/images/I/511vbVrhIBL._AC_UY218_.jpg",
|
||||
"rate": "3.9"
|
||||
"author": "孙钦善",
|
||||
}
|
||||
|
||||
"""
|
||||
@@ -289,18 +315,19 @@ class bookInfoSpide():
|
||||
# f3/d3: mbkn and bookname different
|
||||
[f1,f2,f3] = [0,0,0]
|
||||
[d1,d2,d3] = [{},{},{}]
|
||||
for k,v in mbkinfo.items():
|
||||
mbkn = mbkinfo[0]
|
||||
for k,v in mbkinfo[1].items():
|
||||
bkn = v['bookname']
|
||||
if len(v) == 4:
|
||||
if len(v) == 5:
|
||||
if (not f1) and (mbkn in bkn):
|
||||
f1 = 1
|
||||
d1 = {k:v}
|
||||
d1 = {mbkn:v}
|
||||
elif (not f1) and (not f2) and (bkn in mbkn):
|
||||
f2 = 1
|
||||
d2 = {k:v}
|
||||
d2 = {mbkn:v}
|
||||
elif (not f3):
|
||||
f3 = 1
|
||||
d3 = {k:v}
|
||||
d3 = {mbkn:v}
|
||||
else: continue
|
||||
else:
|
||||
continue
|
||||
@@ -312,7 +339,28 @@ class bookInfoSpide():
|
||||
elif f3:
|
||||
return d3
|
||||
|
||||
return 0
|
||||
return None
|
||||
|
||||
def down_book_img(self, mbkinfo):
    """Download the cover image of every entry in *mbkinfo* into ``IMGPATH``.

    mbkinfo - mapping whose values are dicts carrying the image URL under
              the 'img' key; entries without an 'img' value are skipped.

    Each image is saved under the last path component of its URL.  The
    download is best-effort: a response other than 200 is ignored, and
    network or file errors are printed without stopping the remaining
    downloads.  Returns None.
    """
    import os

    if not mbkinfo:
        return

    # The header value must not repeat the header name.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    # Create the target directory once, without a check-then-create race.
    os.makedirs(IMGPATH, exist_ok=True)

    for v in mbkinfo.values():
        link = v.get('img')
        if not link:
            continue
        p = os.path.join(IMGPATH, link.split('/')[-1])

        try:
            # The timeout keeps a stalled server from blocking the caller
            # forever.
            img = requests.get(link, headers=headers, timeout=30)
            if img.status_code == 200:
                with open(p, 'wb') as fp:
                    fp.write(img.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(e)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -322,9 +370,10 @@ if __name__ == '__main__':
|
||||
bkname = re.split(r'[\((\-\::_\s]',bkname.strip())[0]
|
||||
print(bkname)
|
||||
bkinfo = spide.grab_book_info(bkname)
|
||||
filter_bkinfo = spide.filter_spide_books(bkname, bkinfo)
|
||||
filter_bkinfo = spide.filter_spide_book(bkinfo)
|
||||
if filter_bkinfo: spide.down_book_img(filter_bkinfo)
|
||||
|
||||
logger.debug('================ {} ================'.format(bkname))
|
||||
#logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
||||
logger.debug(json.dumps(bkinfo,indent=2, ensure_ascii=False))
|
||||
logger.debug(json.dumps(filter_bkinfo,indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
|
||||
# taken from a website, not tested; for reference only XXX
|
||||
|
||||
#coding=utf8
|
||||
import random
|
||||
import requests
|
||||
|
||||