kindle manager

gavin
2020-06-06 08:43:34 +08:00
parent c0bfa52fc3
commit 722757dadd
8 changed files with 498 additions and 358 deletions

kman.py

@@ -8,7 +8,9 @@
#########################################################
import re
import os
import json
import time
import logging
import platform
from collections import defaultdict
@@ -107,347 +109,409 @@ r'''
(\d{1,2}:\d{1,2}:\d{1,2}) #group6 - time
''', flags=re.X )
class kMan:
def __init__(self, parent=None):
self.hlnum = 0
self.ntnum = 0
self.refleshtime = '2020/10/10 10:00:00'
self.status = self.status_info()
def status_info(self):
s1 = u'Highlight: {} Note: {} RefreshTime: {}'. \
format(self.hlnum,self.ntnum,self.refleshtime)
kp = self.get_kindle_path()
if not kp:
s2 = u'Disconnected'
else:
with open(kp+'/system/version.txt' , 'r', encoding='utf8', errors='ignore') as f:
s2 = u'Connected ({}) version {}'.format(kp,f.read().strip())
return [s1,s2]
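# Illustrative return value (numbers and version string are assumed, not from this commit):
# with a kindle mounted at /Volumes/Kindle, status_info() returns something like
#   ['Highlight: 120 Note: 15 RefreshTime: 2020-06-06 08:40:00',
#    'Connected (/Volumes/Kindle) version Kindle 5.12.3']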
def parse_section(self,s,i):
"""parse section
Args:
s: section line list
i: section index
Returns:
dict like this:
d = { 'bookname':bookname,
bookname: {
'author':author,
'0':{
'type':'HL',
'position':'123',
'day':'2020年5月26日',
'week':'星期二',
'meridiem':'PM',
'time':'10:26:31',
'content':content }}}
"""
# 1. a highlight over a picture has an empty content (#3) line, so only two lines
# 2. a bookmark section has only two lines
# 3. any other malformed section has fewer than 2 lines
if len(s)<=2:
return False
# parse #2 line
section = defaultdict(dict)
"""
authinfo = sec[0]
dateinfo = sec[1]
content = sec[2] if len(sec)==3 else None
"""
(authinfo, dateinfo, content) = \
(s[0], s[1], s[2] if len(s)==3 else None)
das = da.search(dateinfo)
# type of section
'''
STAT :
START - start line of section
BM - section is a bookmark
HL - section is a highlight
NT - section is a note
'''
tpy = ('HL' if das.group(2)=='标注' else \
('NT' if das.group(2)=='笔记' else 'BM'))
"""
pos = das.group(1)
day = das.group(3)
week = das.group(4)
pmam = das.group(5)
time = das.group(6)
"""
(pos, x, day, week, pmam, time) = das.groups()[0:6]
# parse #1 line
aus = au.search(authinfo)
bookname = aus.group(1)
author = aus.group(2)
section[bookname]['author'] = author
section['bookname'] = bookname
section[bookname][str(i)] = {
'type':tpy,
'position':pos,
'day':day,
'week':week,
'meridiem':pmam,
'time':time,
'content':content }
return section
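# Example (illustrative; the clipping text is assumed and the exact captures depend on
# the `au`/`da` regexes defined above): for a section such as
#   ['BOOKNAME (AUTHOR)',
#    '- 您在位置 #123 的标注 | 添加于 2020年5月26日星期二 下午10:26:31',
#    'highlighted text']
# parse_section(s, 3) would return roughly
#   {'bookname': 'BOOKNAME',
#    'BOOKNAME': {'author': 'AUTHOR',
#                 '3': {'type': 'HL', 'position': '123', 'day': '2020年5月26日',
#                       'week': '星期二', 'meridiem': '下午', 'time': '10:26:31',
#                       'content': 'highlighted text'}}}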
def format_time(self,ds):
""" format date
Args:
ds: 2020年1月13日 星期一 上午 8:11:05
Return:
2020/1/13 8:11:05
"""
d = ds.split(' ')
res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})',d[0])
ymd = '/'.join(res.groups())
res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})',d[3])
tm = ' '+str(int(res.group(1)) + (0 if d[2]=='上午' else 12))+res.group(2)
return ymd+tm
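# Worked example (traced from the code above):
#   format_time('2020年5月26日 星期二 下午 10:26:31') -> '2020/5/26 22:26:31'
# (the hour gets +12 because the meridiem field is not '上午')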
def format_data(self,bks, ft='MD'):
""" format data for MD & CSV
Args:
bks: books dict
ft: can be 'MD'/'CSV'
Return:
list [header, sections]
header and sections are lists
"""
hd =[] # header
secs =[] # content
DELIMITER = '|' if ft=='MD' else ','
hd.append(DELIMITER.join(['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT']))
if ft=='MD':
hd.append(DELIMITER.join(['--' for i in range(5)]))
for kb,vb in bks.items():
author = vb['author']
for ks, vs in vb.items():
if ks in ['author', 'lines']: continue
secs.append(DELIMITER.join([vs['type'],kb,author, \
self.format_time(' '.join([vs['day'],vs['week'],\
vs['meridiem'],vs['time']])),vs['content']]))
return hd+secs
def format_out(self,bks, fnpref, ft='MD'):
"""format output and write to file
markdown format:
TYPE | bookname | author | marktime | content
--|--|--|--|--
xx|xx|xx|xx|xx
CSV format:
TYPE,bookname,author,marktime,content
xx,xx,xx,xx,xx
marktime: 20200403 PM 3:0:3 星期五
Args:
bks: books dict
ft: can be 'MD'/'JSON'/'CSV'
Returns: None (writes the formatted 'bks' dict to file)
"""
suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
op = fnpref+suff[ft]
with open(op, 'w', encoding='utf8', errors='ignore') as fw:
if ft=='JSON':
fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
elif ft in ['MD','CSV']:
for s in self.format_data(bks, ft):
fw.write(s)
fw.write('\n')
else:
fw.write(json.dumps(bks)) # only for load back
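# Usage sketch (file name prefix is illustrative):
#   km.format_out(books, 'clippings', ft='CSV')   # writes clippings.csv
#   km.format_out(books, 'clippings', ft='JSON')  # writes clippings.json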
def drop_duplicate(self,bks):
""" drop duplicated section
If the same place is marked a second time, kindle creates two records,
so the duplicated record needs to be removed
Args:
bks: books dict
Return:
books dict with duplicate sections removed
"""
[preks,prevs] = ['',{'content':'!#$%^&$%','type':'xx'}]
for kb,vb in bks.items():
bks[kb]['lines'] = 0
# add copy() or throw RuntimeError: dictionary changed size during iteration
# reference - http://www.cocoachina.com/articles/89748
for ks, vs in vb.copy().items():
if ks in ['author', 'lines']: continue
bks[kb]['lines'] += 1
if (vs['content'] in prevs['content'] or \
prevs['content'] in vs['content']) and \
prevs['type'] == vs['type']:
bks[kb].pop(preks)
#if vs['content'] != prevs['content']:
# print('prevs',prevs['type'],prevs['content'])
# print(' vs', vs['type'], vs['content'])
preks = ks
prevs = vs
return bks
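# Illustrative case (assumed data): re-marking a passage produces two adjacent
# records of the same type whose contents are substrings of each other
# ('foo' / 'foo bar'); the earlier record is popped and only the later one is kept.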
def add_note_to_highlight(self,bks):
""" append note content to corresponding highlight
and remove NT sections
Args:
bks: books dict
Return:
changed books
"""
[preks,prevs] = ['',{'content':'!#$%^&$%','type':'xx'}]
for kb,vb in bks.items():
for ks,vs in vb.copy().items():
if ks in ['author', 'lines']: continue
if [prevs['type'], vs['type']] == ['HL','NT']:
bks[kb][preks]['content'] += str(NTPREF+vs['content'])
bks[kb].pop(ks)
preks = ks
prevs = vs
return bks
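# Illustrative case (assumed data): when a NT record immediately follows its HL
# record, the note text is appended to the highlight content (prefixed with
# NTPREF) and the separate NT entry is removed from the book dict.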
def search_clip(self,bks, s, t='ALL', p='ALL'):
"""search clip, searching scope may be title/author/content
Args:
input: bks: books dict
s: key word
t: 'ALL'
'HL'
'BM'
'NT'
p: 'ALL'
'TITLE'
'AUTHOR'
'CONTENT'
Return: [match count, matched books dict]
"""
nbks = defaultdict(dict)
nu = 0
for kb,vb in bks.items():
nbks[kb]['lines'] = 0
for ks,vs in vb.copy().items():
if ks in ['author', 'lines']:
nbks[kb][ks] = vs
continue
if t in ['ALL', vs['type']]:
scopestr = {'ALL':''.join([kb,vb['author'],vs['content']]), \
'TITLE':kb, 'AUTHOR':vb['author'], 'CONTENT':vs['content']}
found = re.search(s, scopestr[p])
if found:
nbks[kb][ks] = vs
nbks[kb]['lines'] += 1
nu += 1
if nbks[kb]['lines']==0:
nbks.pop(kb)
return [nu,nbks]
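# Usage sketch (keyword and output prefix are illustrative):
#   n, hits = km.search_clip(books, '经济', t='HL', p='TITLE')
#   if n: km.format_out(hits, 'search_economy', ft='MD')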
# to be implemented
def statistic(self,bks):
pass
def dict2json(self,d):
"""convert dict to json
Args: d is the dict
Return: json string
"""
jstr = json.dumps(d)
return jstr
def json2dict(self,jf):
"""convert dict to json
Args: jf is the file saved json string
Return: dict
"""
d = {}
with open(jf, 'r', encoding='utf8', errors='ignore') as f:
d=json.load(f)
return d
def get_kindle_path(self):
"""check and return kindle device path
Args:
Return:
if kindle connected, return path string of kindle device
else return None
"""
cmd = "wmic logicaldisk get name,volumename" if os.name=='nt'\
else ("ls /Volumes/Kindle" if os.name=='posix' else '')
# not tested on windows & linux
with os.popen(cmd) as s:
r = s.read()
if os.name == 'nt': # windows
for d in r.split('\n'):
if 'Kindle' in d: return re.split(r'\s+', d)[0]
elif os.name == 'posix': # mac os
if r: return('/Volumes/Kindle')
else:
pass
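# Illustrative results (platform behaviour assumed from the commands above):
#   macOS   -> '/Volumes/Kindle' when the device is mounted
#   Windows -> the drive letter (e.g. 'E:') parsed from the wmic output
#   otherwise the function falls through and returns None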
def import_clips(self, tp='local'):
"""import clips from local file or kindle
each section is 4 lines separated with '======='
so read the 4 lines before each '======='
Args: tp: 'local' local clipping file
'kindle' kindle clipping file
Return: 0 - kindle import requested but no kindle is connected
books dict
"""
if tp=='kindle':
kp = self.get_kindle_path()
if not kp: return 0
else: path = kp
else:
path = CLIPPATH
# loop to fill books dict
with open(path, 'r', encoding='utf8', errors='ignore') as f:
bks = defaultdict(dict)
secd = defaultdict(dict)
sidx = 0
idx = 0
sec = []
for line in f.readlines():
line = line.strip()
if re.match(r'^\s*$',line): continue
idx += 1
if not re.search(LASTLINE,line):
# content more than 1 line
if idx>3:
sec[2] += str(' '+line)
#logger.debug('idx {} {}'.format(idx, sec[2]))
else:
sec.append(line)
#logger.debug('idx {} {}'.format(idx, sec[idx-1]))
else:
idx = 0
sidx += 1
# parsing section & fill data structure
secd = self.parse_section(sec,sidx)
if secd:
bn = secd['bookname']
tpy = secd[bn][str(sidx)]['type']
bks[bn]['author'] = secd[bn]['author']
bks[bn][str(sidx)] = secd[bn][str(sidx)]
# do not append note content to the highlight here,
# because notes may be duplicated; remove duplicates first
"""
if tpy=='NT' and bks[bn][str(sidx-1)]['type']=='HL':
bks[bn][str(sidx-1)]['content'] += str(NTPREF+sec[2])
"""
if tpy=='HL': self.hlnum += 1
elif tpy=='NT': self.ntnum += 1
else: # BM or not correct format section
sidx -= 1
# initial section for next section loop
sec = []
self.refleshtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
return bks
if __name__=='__main__':
#books = defaultdict(dict)
km = kMan()
books = km.import_clips('local')
# remove duplication
km.drop_duplicate(books)
# test search note function
searchnote = km.search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchcontent', ft='MD')
searchnote = km.search_clip(books, '经济', 'ALL', 'TITLE')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchtitle', ft='MD')
searchnote = km.search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchauthor', ft='MD')
# add note content to highlight, then delete note
km.add_note_to_highlight(books)
# test dict json convert
with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
fw.write(km.dict2json(books))
if km.json2dict('./xx')==books: print('test OK')
km.format_out(books, OUTPREF, ft='MD')
# print data with json format
logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))