619 lines
19 KiB
Python
619 lines
19 KiB
Python
|
|
#########################################################
|
|
## @file : kman.py
|
|
## @desc : kindle note management tool
|
|
## @create : 2020/05/26
|
|
## @author : Chengan
|
|
## @email : douboer@gmail.com
|
|
#########################################################
|
|
|
|
import csv
import io
import json
import logging
import os
import platform
import re
import subprocess
import time
from collections import defaultdict
|
|
|
|
# data structure - use dict
|
|
'''
|
|
books =
|
|
{
|
|
"bookname_xxx": {
|
|
"author": "李",
|
|
"section1636": {
|
|
"content": "张",
|
|
"day": "2020年4月3日",
|
|
"meridiem": "下午",
|
|
"position": "311-311",
|
|
"time": "3:00:53",
|
|
"type": "HL",
|
|
"week": "星期五"
|
|
},
|
|
"section1651": {
|
|
"content": "治",
|
|
"day": "2020年4月3日",
|
|
"meridiem": "下午",
|
|
"position": "514",
|
|
"time": "3:43:50",
|
|
"type": "NT",
|
|
"week": "星期五"
|
|
},
|
|
"section1814": {
|
|
"content": null,
|
|
"day": "2020年4月12日",
|
|
"meridiem": "下午",
|
|
"position": "5186",
|
|
"time": "2:20:12",
|
|
"type": "BM",
|
|
"week": "星期日"
|
|
},
|
|
...
|
|
},
|
|
...
|
|
}
|
|
'''
|
|
|
|
# Short tag for the host operating system: 'WIN', 'LINUX' or 'MAC'.
# platform.system() reports 'Windows', 'Linux' or 'Darwin' (capitalised), so
# the comparison must use 'Linux'; every other value falls back to 'MAC'.
# NOTE(review): presumably meant for choosing the clippings path per OS;
# nothing in this file reads SYS yet.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')
|
|
|
|
# some constants
LASTLINE = '=========='        # separator line that ends each section in the clippings file
NTPREF = '--CG注:'             # prefix put before note text appended to a highlight
CLIPFN = 'My Clippings.txt'    # clippings file name
CLIPPATH = './'                # directory of the clippings file, e.g. /Volumes/Kindle/documents/
#CLIPPATH = './tclip.txt'
OUTPREF = './clip'             # output file name prefix (suffix is added per format)
DEBUG = 1                      # 0 - INFO; 1 - DEBUG
LOG2FILE = 1                   # 0 - to stdio; 1 - to file
LOGFILE = 'log'                # log file name used when LOG2FILE is set
DELIMITER = '|'                # default field delimiter
#HEADER = {0:'type',1:'bookname',2:'author',3:'position',4:'date',5:'content'}
|
|
|
|
# log info
# The root logger is configured so every module logging through it shares the
# same destination.
logger = logging.getLogger()
# A more verbose alternative format:
#   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('')

if LOG2FILE:
    # Explicit utf8 so non-ASCII clipping text is written the same way as the
    # other files this module produces.
    handler = logging.FileHandler(LOGFILE, encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# DEBUG: 0 - INFO; 1 - DEBUG.  Set the level in both cases, otherwise the
# file-logging configuration stays at the root default and drops INFO records.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
|
|
|
|
# author & bookname info (first line of a section): "<bookname> (<author>)".
# The leading \ufeff* skips byte-order marks in front of the title.  Both
# groups are greedy, so the author is taken from the last "(...)" on the line.
au = re.compile(
    r'''
    ^\ufeff*
    (.+) \(                         #bookname
    (.+)\)                          #author
    ''', flags=re.X )

# page & date info (second line of a section)
# vim equivalent of the original pattern:
#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                  #group1 - position: "514" or "311-311" (a single digit is valid too)
    .+
    (笔记|标注|书签)                #group2 - type
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)     #group3 - xxxx年xx月xx日
    (星期.)                         #group4 - week
    \s
    (..)                            #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})       #group6 - time
    ''', flags=re.X )
|
|
|
|
class kMan:
    """Import, clean up, search and export Kindle clippings.

    Most methods work on a "books dict" shaped like::

        {bookname: {'author': str,
                    'lines': int,          # optional section counter
                    '<index>': {'type': 'HL' | 'NT' | 'BM',
                                'position': str, 'day': str, 'week': str,
                                'meridiem': str, 'time': str,
                                'content': str}}}
    """

    # Keys of a book entry that hold metadata instead of a clipping section.
    _META_KEYS = ('author', 'lines')
    # Clipping type as written in the clippings file -> internal type code.
    _TYPE_CODES = {'标注': 'HL', '笔记': 'NT', '书签': 'BM'}
    # Internal type code -> label returned by filter_clips().
    _TYPE_LABELS = {'HL': '标注', 'NT': '笔记', 'BM': '书签'}

    def __init__(self, parent=None):
        """Initialise counters and take a first status snapshot.

        Args:
            parent: accepted for compatibility with callers; not used here.
        """
        self.hlnum = 0
        self.ntnum = 0
        self.refleshtime = '2020/10/10 10:00:00'
        self.status = self.status_info()

    # ------------------------------------------------------------ helpers
    @classmethod
    def _section_count(cls, book):
        """Return the number of clipping sections stored in one book entry."""
        return sum(1 for key in book if key not in cls._META_KEYS)

    @staticmethod
    def _is_duplicate(prev, cur):
        """Return True if two sections have the same type and one content contains the other."""
        a, b = prev.get('content'), cur.get('content')
        # Empty/None content must not count: '' is a substring of everything.
        if prev.get('type') != cur.get('type') or not a or not b:
            return False
        return a in b or b in a

    @staticmethod
    def _csv_line(fields):
        """Return one CSV record (without line terminator) with proper quoting."""
        buf = io.StringIO()
        csv.writer(buf, lineterminator='').writerow(fields)
        return buf.getvalue()

    # ------------------------------------------------------------- status
    def status_info(self):
        """Build the two status strings.

        Return:
            [summary, device] where summary holds the highlight/note counters
            and refresh time, and device tells whether a Kindle was found.
        """
        summary = u'Hightlight: {} Note: {} RefleshTime: {}'. \
                  format(self.hlnum, self.ntnum, self.refleshtime)
        kp = self.get_kindle_path()

        if not kp:
            device = u'Disconnected ({})'.format(CLIPPATH+CLIPFN)
        else:
            # A device without a readable version file must not break
            # construction of the object (this runs from __init__).
            try:
                with open(kp+'/system/version.txt', 'r', encoding='utf8', errors='ignore') as f:
                    version = f.read().strip()
            except OSError:
                version = 'unknown'
            device = u'Connected ({}) version {}'.format(kp, version)

        return [summary, device]

    # ------------------------------------------------------------ parsing
    def parse_section(self, s, i):
        """Parse one section of the clippings file.

        Args:
            s: list of the section's lines: [title line, info line, content...]
            i: section index, used (as a string) as the key of the section

        Returns:
            False when the section has no content line (e.g. a bookmark or a
            highlight over a picture) or when the title/info line does not
            match the expected format; otherwise a dict like::

                {'bookname': bookname,
                 bookname: {'author': author,
                            str(i): {'type': 'HL', 'position': '123',
                                     'day': '2020年5月26日', 'week': '星期二',
                                     'meridiem': '下午', 'time': '10:26:31',
                                     'content': content}}}
        """
        if len(s) <= 2:
            return False

        authinfo, dateinfo = s[0], s[1]
        # Extra content lines are joined with a blank, the same way
        # import_clips() folds multi-line content.
        content = ' '.join(s[2:])

        das = da.search(dateinfo)
        aus = au.search(authinfo)
        if das is None or aus is None:
            # Unknown layout: report "not a usable section" instead of crashing.
            return False

        pos, kind, day, week, pmam, clock = das.groups()
        tpy = self._TYPE_CODES.get(kind, 'BM')

        # NOTE(review): the second replace removes every blank from the name;
        # confirm this is intended for titles written with spaces.
        bookname = aus.group(1).strip().replace(' ','')
        author = aus.group(2).strip().replace(' ','')

        section = defaultdict(dict)
        section[bookname]['author'] = author
        section['bookname'] = bookname
        section[bookname][str(i)] = {
            'type':tpy,
            'position':pos,
            'day':day,
            'week':week,
            'meridiem':pmam,
            'time':clock,
            'content':content }

        return section

    # --------------------------------------------------------- formatting
    def format_time(self, ds):
        """Convert a clipping timestamp to 'Y/M/D H:MM:SS' on a 24-hour clock.

        Args:
            ds: e.g. '2020年1月13日 星期一 下午 8:11:05'
                (day, weekday, meridiem and time separated by single blanks)
        Return:
            e.g. '2020/1/13 20:11:05'
        """
        parts = ds.split(' ')
        ymd = '/'.join(re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})', parts[0]).groups())
        hms = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})', parts[3])

        # 12 o'clock is hour 0 of its half-day: 上午 12:xx -> 0:xx and
        # 下午 12:xx -> 12:xx.  Anything that is not 上午 counts as afternoon.
        hour = int(hms.group(1)) % 12
        if parts[2] != '上午':
            hour += 12

        return '{} {}{}'.format(ymd, hour, hms.group(2))

    def format_data(self, bks, ft='MD'):
        """Format the books dict as markdown table lines or CSV records.

        Args:
            bks: books dict
            ft: 'MD' for a markdown table; any other value produces CSV

        Return:
            list of strings: the header line(s) followed by one line per
            section.  CSV fields are quoted when they contain a comma or a
            quote character.
        """
        rows = [['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT']]
        if ft == 'MD':
            rows.append(['--'] * 5)

        for kb, vb in bks.items():
            author = vb['author']
            for ks, vs in vb.items():
                if ks in self._META_KEYS:
                    continue
                stamp = self.format_time(
                    ' '.join([vs['day'], vs['week'], vs['meridiem'], vs['time']]))
                # Content may be None (e.g. data loaded back from JSON).
                rows.append([vs['type'], kb, author, stamp, vs['content'] or ''])

        if ft == 'MD':
            return ['|'.join(row) for row in rows]
        return [self._csv_line(row) for row in rows]

    def format_out(self, bks, fnpref, ft='MD'):
        """Write the books dict to '<fnpref><suffix>' in the requested format.

        markdown format:
            TYPE|BOOKNAME|AUTHOR|MARKTIME|CONTENT
            --|--|--|--|--
            xx|xx|xx|xx|xx

        CSV format:
            TYPE,BOOKNAME,AUTHOR,MARKTIME,CONTENT
            xx,xx,xx,xx,xx

        Args:
            bks: books dict
            fnpref: output path without suffix
            ft: 'MD' / 'CSV' / 'JSON'

        Raises:
            KeyError: ft is not one of the supported formats (raised before
            any file is created).
        """
        suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
        op = fnpref+suff[ft]

        with open(op, 'w', encoding='utf8', errors='ignore') as fw:
            if ft == 'JSON':
                fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
            else:
                fw.writelines(line + '\n' for line in self.format_data(bks, ft))

    # ------------------------------------------------------------ cleanup
    def drop_duplicate(self, bks):
        """Drop duplicated sections in place.

        Marking the same place a second time makes the Kindle store two
        records.  When two consecutive sections of one book have the same
        type and one content contains the other, the earlier one is removed.
        Each book's 'lines' entry is set to the number of sections left.

        Args:
            bks: books dict
        Return:
            the same dict, with duplicates removed
        """
        for book in bks.values():
            # The "previous section" is tracked per book: section keys are
            # unique to their book, so carrying it over would pop a key the
            # current book does not have.
            prev_key, prev = None, None
            # Iterate over a snapshot because entries are popped on the way.
            for key, sec in list(book.items()):
                if key in self._META_KEYS:
                    continue
                if prev is not None and self._is_duplicate(prev, sec):
                    book.pop(prev_key, None)
                prev_key, prev = key, sec
            book['lines'] = self._section_count(book)

        return bks

    def get_bookname_num(self, bks):
        """Count sections per book.

        Args:
            bks: books dict
        Return:
            [total, {bookname: num, ...}]
        """
        counts = defaultdict(dict)
        total = 0
        for kb, vb in bks.items():
            num = self._section_count(vb)
            counts[kb] = num
            total += num

        return [total, counts]

    def get_author_num(self, bks):
        """Count sections per author.

        Args:
            bks: books dict
        Return:
            [total, {author: num, ...}]; authors whose books hold no section
            are not listed.
        """
        counts = defaultdict(dict)
        total = 0
        for vb in bks.values():
            num = self._section_count(vb)
            if not num:
                continue
            author = vb['author']
            counts[author] = counts.get(author, 0) + num
            total += num

        return [total, counts]

    def filter_clips(self, bks, info=None, tp=0):
        """Select sections by book name or author and flatten them to rows.

        Args:
            bks: books dict
            info: book name (tp=1) or author (tp=2) to keep
            tp: 0 - no filtering (root item clicked)
                1 - filter by book name
                2 - filter by author
        Return:
            list of rows [Type, Bookname, Author, Position, Date, Content]
            where Type is the label 标注 / 笔记 / 书签.
        """
        if tp == 0:
            selected = bks
        else:
            selected = {kb: vb for kb, vb in bks.items()
                        if (tp == 1 and info == kb)
                        or (tp == 2 and info == vb['author'])}

        seclist = []
        for kb, vb in selected.items():
            for ks, vs in vb.items():
                if ks in self._META_KEYS:
                    continue
                tm = self.format_time(
                    ' '.join([vs['day'], vs['week'], vs['meridiem'], vs['time']]))
                # Unknown type codes keep the former default label.
                nttype = self._TYPE_LABELS.get(vs['type'], '笔记')
                seclist.append([nttype, kb, vb['author'], vs['position'], tm, vs['content']])

        return seclist

    def add_note_to_highlight(self, bks):
        """Append each note to the highlight right before it and drop the note.

        A note (NT) section that directly follows a highlight (HL) section of
        the same book is appended to that highlight's content, prefixed with
        NTPREF, and removed.  An existing 'lines' counter is kept in step.

        Args:
            bks: books dict
        Return:
            the same dict, modified in place
        """
        for book in bks.values():
            # Tracked per book: a note opening a book has no highlight of its
            # own book to attach to.
            prev_key, prev = None, None
            for key, sec in list(book.items()):
                if key in self._META_KEYS:
                    continue
                if prev is not None and [prev['type'], sec['type']] == ['HL', 'NT']:
                    target = book[prev_key]
                    target['content'] = (target['content'] or '') + \
                                        str(NTPREF + (sec['content'] or ''))
                    book.pop(key)
                    if 'lines' in book:
                        book['lines'] -= 1
                prev_key, prev = key, sec

        return bks

    # ------------------------------------------------------------- search
    def search_clip(self, bks, s, t='ALL', p='ALL'):
        """Search sections by key word in title, author and/or content.

        Args:
            bks: books dict
            s: key word; used as a regular expression, or as literal text
               when it is not a valid expression
            t: section type to search: 'ALL' / 'HL' / 'BM' / 'NT'
            p: scope to search: 'ALL' / 'TITLE' / 'AUTHOR' / 'CONTENT'
        Return:
            [number of matches, result books dict]; the result only holds
            books with at least one match, each with 'author', the matching
            sections and 'lines' set to the number of matches.
        """
        try:
            pattern = re.compile(s)
        except re.error:
            pattern = re.compile(re.escape(s))

        nbks = defaultdict(dict)
        nu = 0
        for kb, vb in bks.items():
            hits = {}
            for ks, vs in vb.items():
                if ks in self._META_KEYS or t not in ['ALL', vs['type']]:
                    continue
                content = vs['content'] or ''
                scopestr = {'ALL': ''.join([kb, vb['author'], content]),
                            'TITLE': kb, 'AUTHOR': vb['author'], 'CONTENT': content}
                if pattern.search(scopestr[p]):
                    hits[ks] = vs

            if not hits:
                continue
            entry = nbks[kb]
            if 'author' in vb:
                entry['author'] = vb['author']
            entry.update(hits)
            # Counted from the matches; the source book's own 'lines' value
            # must not leak into the result.
            entry['lines'] = len(hits)
            nu += len(hits)

        return [nu, nbks]

    # to be implement
    def statistic(self, bks):
        """Placeholder, not implemented yet."""
        pass

    # --------------------------------------------------------------- json
    def dict2json(self, d):
        """Convert dict to json.

        Args: d is the dict
        Return: json string
        """
        return json.dumps(d)

    def json2dict(self, jf):
        """Load a dict from a json file.

        Args: jf is the path of the file holding the json string
        Return: dict
        """
        with open(jf, 'r', encoding='utf8', errors='ignore') as f:
            return json.load(f)

    # ------------------------------------------------------------- device
    def get_kindle_path(self):
        """Check for a connected Kindle and return its path.

        Return:
            Windows: the first column (drive name) of the `wmic logicaldisk`
            row that mentions 'Kindle'.
            posix: '/Volumes/Kindle/' when that directory has visible entries.
            False when no device is found.
        """
        if os.name == 'nt':
            # not tested on windows
            try:
                proc = subprocess.run(
                    ['wmic', 'logicaldisk', 'get', 'name,volumename'],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except OSError:
                # wmic is not available
                return False
            # NOTE(review): the encoding of wmic output is not verified here;
            # undecodable bytes are skipped.
            listing = proc.stdout.decode('utf-8', errors='ignore')
            for row in listing.splitlines():
                if 'Kindle' in row:
                    return row.split()[0]
            return False

        if os.name == 'posix':
            # Same test as a non-empty `ls /Volumes/Kindle`, without a shell.
            try:
                entries = os.listdir('/Volumes/Kindle')
            except OSError:
                return False
            if any(not name.startswith('.') for name in entries):
                return '/Volumes/Kindle/'

        return False

    # ------------------------------------------------------------- import
    def import_clips(self, fp=None):
        """Import clips from a clippings file.

        Every section is terminated by a line starting with LASTLINE.  The
        first line of a section is the title/author line, the second the
        position/date line and everything after that is content, joined with
        blanks.  Sections rejected by parse_section() are skipped.

        Side effects: self.hlnum / self.ntnum are reset and recounted for
        this import and self.refleshtime is set to the current local time.

        Args:
            fp: file path; defaults to CLIPPATH+CLIPFN
        Return:
            books dict
        """
        if fp is None:
            # Resolved at call time so changes to the constants are honoured.
            fp = CLIPPATH+CLIPFN

        # The counters describe the current import only.
        self.hlnum = 0
        self.ntnum = 0

        bks = defaultdict(dict)
        sidx = 0
        sec = []
        with open(fp, 'r', encoding='utf8', errors='ignore') as f:
            for raw in f:
                line = raw.strip()
                if not line:
                    continue

                if not line.startswith(LASTLINE):
                    if len(sec) >= 3:
                        # content more than 1 line
                        sec[2] += str(' '+line)
                    else:
                        sec.append(line)
                    continue

                # separator reached: parse the collected section
                sidx += 1
                secd = self.parse_section(sec, sidx)

                if secd:
                    bn = secd['bookname']
                    tpy = secd[bn][str(sidx)]['type']

                    bks[bn]['author'] = secd[bn]['author']
                    bks[bn][str(sidx)] = secd[bn][str(sidx)]

                    # Notes are not merged into highlights here: duplicated
                    # records have to be removed first.
                    if tpy == 'HL':
                        self.hlnum += 1
                    elif tpy == 'NT':
                        self.ntnum += 1
                else:
                    # BM or not correct format section: reuse the index
                    sidx -= 1

                # initial section for next section loop
                sec = []

        self.refleshtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        return bks
|
|
|
|
if __name__=='__main__':
    # Smoke test: import the default clippings file and exercise the API.
    km = kMan()
    books = km.import_clips()

    # remove duplication
    km.drop_duplicate(books)

    # test search note function; a result file is written only when
    # something was found
    searchnote = km.search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
    if searchnote[0] > 0: km.format_out(searchnote[1], 'searchcontent', ft='MD')
    searchnote = km.search_clip(books, '经济', 'ALL', 'TITLE')
    if searchnote[0] > 0: km.format_out(searchnote[1], 'searchtitle', ft='MD')
    searchnote = km.search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
    if searchnote[0] > 0: km.format_out(searchnote[1], 'searchauthor', ft='MD')

    print(km.get_bookname_num(books))
    print(km.get_author_num(books))

    # add note content to highlight, then delete note
    km.add_note_to_highlight(books)

    # test dict json convert: dump to a scratch file and load it back
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(km.dict2json(books))
    if km.json2dict('./xx')==books: print( 'test OK')

    km.format_out(books, OUTPREF, ft='MD')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
|