kindle manager
This commit is contained in:
212
kman.py
212
kman.py
@@ -2,7 +2,7 @@
|
||||
#########################################################
|
||||
## @file : kman.py
|
||||
## @desc : kindle note management tool
|
||||
## @create : 20200526
|
||||
## @create : 2020/05/26
|
||||
## @author : Chengan
|
||||
## @email : douboer@gmail.com
|
||||
#########################################################
|
||||
@@ -63,6 +63,7 @@ CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
|
||||
OUTPREF = './clip'
|
||||
DEBUG = 1 # 0 - INFO; 1 - DEBUG
|
||||
LOG2FILE = 1 # 0 - to stdio; 1 - to file
|
||||
DELIMITER= '|'
|
||||
|
||||
# log info
|
||||
logger = logging.getLogger()
|
||||
@@ -125,8 +126,6 @@ def parse_section(s,i):
|
||||
'time':'10:26:31'
|
||||
'content':content }}}
|
||||
"""
|
||||
|
||||
|
||||
# 1. highlight over the picture, the content(#3 line) is empty, only two lines
|
||||
# 2. bookmark section only two lines
|
||||
# 3. other not correct format < 2
|
||||
@@ -135,9 +134,13 @@ def parse_section(s,i):
|
||||
|
||||
# parse #2 line
|
||||
section = defaultdict(dict)
|
||||
"""
|
||||
authinfo = sec[0]
|
||||
dateinfo = sec[1]
|
||||
content = sec[2] if len(sec)==3 else None
|
||||
"""
|
||||
(authinfo, dateinfo, content) = \
|
||||
(s[0], s[1], s[2] if len(s)==3 else None)
|
||||
|
||||
das = da.search(dateinfo)
|
||||
# type of section
|
||||
@@ -150,11 +153,14 @@ def parse_section(s,i):
|
||||
'''
|
||||
tpy = ('HL' if das.group(2)=='标注' else \
|
||||
('NT' if das.group(2)=='笔记' else 'BM'))
|
||||
"""
|
||||
pos = das.group(1)
|
||||
day = das.group(3)
|
||||
week = das.group(4)
|
||||
pmam = das.group(5)
|
||||
time = das.group(6)
|
||||
"""
|
||||
(pos, x, day, week, pmam, time) = das.groups()[0:6]
|
||||
|
||||
# parse #1 line
|
||||
aus = au.search(authinfo)
|
||||
@@ -174,9 +180,52 @@ def parse_section(s,i):
|
||||
|
||||
return section
|
||||
|
||||
def format_out(bks, ft='MD'):
|
||||
def format_time(ds):
    """ format a Kindle (Chinese locale) 12-hour timestamp as 24-hour text

    Args:
        ds: four fields separated by single spaces - date, weekday,
            meridiem, time - e.g. '2020年1月13日 星期一 上午 8:11:05'.
            The weekday field is ignored.
    Return:
        'Y/M/D H:MM:SS' in 24-hour form, digits copied as found
        (no zero padding added), e.g.
            '2020年1月13日 星期一 上午 8:11:05'  -> '2020/1/13 8:11:05'
            '2020年1月13日 星期一 下午 8:11:05'  -> '2020/1/13 20:11:05'
            '2020年1月13日 星期一 下午 12:11:05' -> '2020/1/13 12:11:05'
            '2020年1月13日 星期一 上午 12:11:05' -> '2020/1/13 0:11:05'
    Raises:
        IndexError: ds has fewer than four space separated fields
        AttributeError: the date or time field does not match the pattern
    """
    fields = ds.split(' ')

    # date: any single character is accepted between the numbers (年/月)
    res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})', fields[0])
    ymd = '/'.join(res.groups())

    res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})', fields[3])
    hour = int(res.group(1))
    if fields[2] == '上午':
        # 12 AM is midnight, i.e. hour 0
        if hour == 12:
            hour = 0
    elif hour < 12:
        # any other meridiem is treated as afternoon; 12 PM stays 12 and
        # an hour that is already >= 12 is not shifted a second time
        hour += 12

    return ymd + ' ' + str(hour) + res.group(2)
|
||||
|
||||
def format_data(bks, ft='MD'):
    """ build the text lines of the MD / CSV export

    Args:
        bks: books dict; each value holds 'author', optionally 'lines',
             and one entry per section
        ft: 'MD' joins columns with '|' and adds a '--' separator row,
            any other value joins columns with ','

    Return:
        one flat list: header line(s) followed by one line per section
    """
    sep = '|' if ft == 'MD' else ','

    header = [sep.join(['TYPE', 'BOOKNAME', 'AUTHOR', 'MARKTIME', 'CONTENT'])]
    if ft == 'MD':
        # markdown needs the dashed row under the column titles
        header.append(sep.join(['--'] * 5))

    rows = []
    for title, book in bks.items():
        writer = book['author']
        for key, clip in book.items():
            # 'author' and 'lines' are book level entries, not sections
            if key == 'author' or key == 'lines':
                continue
            rows.append(sep.join([
                clip['type'],
                title,
                writer,
                format_time(' '.join([clip['day'], clip['week'],
                                      clip['meridiem'], clip['time']])),
                clip['content'],
            ]))

    return header + rows
|
||||
|
||||
def format_out(bks, fnpref, ft='MD'):
|
||||
"""format output and write to file
|
||||
MARKDOWN format:
|
||||
markdown format:
|
||||
TYPE | bookname | author | marktime | content
|
||||
--|--|--|--|--
|
||||
xx|xx|xx|xx|xx
|
||||
@@ -195,15 +244,107 @@ def format_out(bks, ft='MD'):
|
||||
"""
|
||||
|
||||
suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
|
||||
op = OUTPREF+suff[ft]
|
||||
op = fnpref+suff[ft]
|
||||
|
||||
with open(op, 'w', encoding='gbk', errors='ignore') as fw:
|
||||
with open(op, 'w', encoding='utf8', errors='ignore') as fw:
|
||||
if ft=='JSON':
|
||||
ft.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
|
||||
elif ft=='MD':
|
||||
pass
|
||||
fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
|
||||
elif ft in ['MD','CSV']:
|
||||
for s in format_data(bks, ft):
|
||||
fw.write(s)
|
||||
fw.write('\n')
|
||||
else:
|
||||
ft.write(json.dumps(bks)) # only for load back
|
||||
fw.write(json.dumps(bks)) # only for load back
|
||||
|
||||
def drop_duplicate(bks):
    """ drop duplicated sections, in place

    Marking the same place a second time makes kindle write a second
    record, so when two consecutive sections of one book have the same
    type and the content of one contains the content of the other, the
    earlier of the two is removed and the later one is kept.

    Each book also gets a 'lines' entry holding the number of sections
    left in it after the duplicates are removed.

    Args:
        bks: books dict
    Return:
        the same bks object, with duplicate sections removed
    """
    for kb, vb in bks.items():
        # start every book without a "previous" section, otherwise the
        # first section of this book could match the last section of the
        # previous book and pop a key that does not exist here
        preks, prevs = None, None
        vb['lines'] = 0

        # iterate over a copy: popping from the dict being iterated raises
        # RuntimeError: dictionary changed size during iteration
        for ks, vs in vb.copy().items():
            if ks in ('author', 'lines'):
                continue
            vb['lines'] += 1

            if prevs is not None and prevs['type'] == vs['type'] and \
                    (vs['content'] in prevs['content'] or
                     prevs['content'] in vs['content']):
                vb.pop(preks)
                vb['lines'] -= 1    # the removed section no longer counts
                # show the pair when it was a partial, not an exact, overlap
                if vs['content'] != prevs['content']:
                    print('prevs', prevs['type'], prevs['content'])
                    print(' vs', vs['type'], vs['content'])

            preks, prevs = ks, vs

    return bks
|
||||
|
||||
def add_note_to_highlight(bks):
    """ append note content to the highlight it follows, in place

    When an NT section comes directly after an HL section of the same
    book, NTPREF plus the note content is appended to the highlight
    content and the NT section is removed. An NT section that does not
    directly follow an HL section is left untouched.

    If the book carries a 'lines' entry it is decremented for every
    removed note so that it keeps matching the number of sections.

    Args:
        bks: books dict
    Return:
        the same bks object, changed
    """
    for kb, vb in bks.items():
        # start every book without a "previous" section, otherwise a book
        # that begins with a note would be paired with the last highlight
        # of the previous book, whose key does not exist in this book
        preks, prevs = None, None

        # iterate over a copy because sections are popped inside the loop
        for ks, vs in vb.copy().items():
            if ks in ('author', 'lines'):
                continue

            if prevs is not None and prevs['type'] == 'HL' and vs['type'] == 'NT':
                vb[preks]['content'] += str(NTPREF + vs['content'])
                vb.pop(ks)
                if 'lines' in vb:
                    vb['lines'] -= 1

            # the note itself becomes "previous", so a second note in a
            # row is not merged (same as before)
            preks, prevs = ks, vs

    return bks
|
||||
|
||||
def search_clip(bks, s, t='ALL', p='ALL'):
    """ search clip, searching scope may be title/author/content

    Args:
        bks: books dict (not modified)
        s: key word; it is handed to re.search() as the pattern, so
           regular expression metacharacters in it are active
        t: section type to look at
           'ALL' / 'HL' / 'BM' / 'NT'
        p: text the key word is searched in
           'ALL'     - title + author + content joined together
           'TITLE'
           'AUTHOR'
           'CONTENT'
    Return:
        [nu, nbks]
        nu:   total number of matching sections
        nbks: defaultdict with only the books that have at least one
              matching section; each holds 'lines' (number of matches in
              that book), 'author' and the matching sections under their
              original keys
    Raises:
        KeyError: p is not one of the four scopes (raised once a section
                  passes the type filter)
    """
    nbks = defaultdict(dict)
    nu = 0
    for kb, vb in bks.items():
        hits = {}
        for ks, vs in vb.items():
            # book level entries are not sections; in particular the
            # source 'lines' value must not replace the match counter
            if ks in ('author', 'lines'):
                continue
            if t not in ('ALL', vs['type']):
                continue

            scopestr = {'ALL': ''.join([kb, vb['author'], vs['content']]),
                        'TITLE': kb,
                        'AUTHOR': vb['author'],
                        'CONTENT': vs['content']}
            if re.search(s, scopestr[p]):
                hits[ks] = vs

        # books without any match are left out of the result
        if not hits:
            continue

        nbks[kb]['lines'] = len(hits)
        nbks[kb]['author'] = vb['author']
        nbks[kb].update(hits)
        nu += len(hits)

    return [nu, nbks]
|
||||
|
||||
def statistic(bks):
    """ placeholder, not implemented yet: ignores bks and returns None """
|
||||
@@ -226,26 +367,11 @@ def json2dict(jf):
|
||||
d=json.load(f)
|
||||
return d
|
||||
|
||||
def search_clip(bks, s, t='ALL', p='ALL'):
|
||||
"""search clip, searching scope may be title/author/content
|
||||
Args:
|
||||
input: bks: books dict
|
||||
s: key word
|
||||
t: 'ALL'
|
||||
'HL'
|
||||
'BM'
|
||||
'NT'
|
||||
p: 'ALL'
|
||||
'TITLE'
|
||||
'AUTHOR'
|
||||
'CONTENT'
|
||||
Return: search clipping content
|
||||
"""
|
||||
pass
|
||||
|
||||
if __name__=='__main__':
|
||||
# 4 lines for each section separated with '======='
|
||||
# so read 4 lines before '======='
|
||||
|
||||
# loop to fill books dict
|
||||
with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
|
||||
books = defaultdict(dict)
|
||||
secd = defaultdict(dict)
|
||||
@@ -261,10 +387,10 @@ if __name__=='__main__':
|
||||
# content more than 1 line
|
||||
if idx>3:
|
||||
sec[2] += str(' '+line)
|
||||
logger.debug('idx {} {}'.format(idx, sec[2]))
|
||||
#logger.debug('idx {} {}'.format(idx, sec[2]))
|
||||
else:
|
||||
sec.append(line)
|
||||
logger.debug('idx {} {}'.format(idx, sec[idx-1]))
|
||||
#logger.debug('idx {} {}'.format(idx, sec[idx-1]))
|
||||
else:
|
||||
idx = 0
|
||||
sidx += 1
|
||||
@@ -279,19 +405,41 @@ if __name__=='__main__':
|
||||
books[bn]['author'] = secd[bn]['author']
|
||||
books[bn][str(sidx)] = secd[bn][str(sidx)]
|
||||
|
||||
# not add note to highlight content here,
|
||||
# because NT maybe duplicated, we need remove duplication record before
|
||||
"""
|
||||
if tpy=='NT' and books[bn][str(sidx-1)]['type']=='HL':
|
||||
books[bn][str(sidx-1)]['content'] += str(NTPREF+sec[2])
|
||||
"""
|
||||
|
||||
else: # BM or not correct format section
|
||||
sidx -= 1
|
||||
|
||||
# initial section for next section loop
|
||||
sec = []
|
||||
|
||||
# remove duplication
|
||||
drop_duplicate(books)
|
||||
|
||||
# test search note function
|
||||
searchnote = search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
|
||||
if searchnote[0] > 0: format_out(searchnote[1], 'searchcontent', ft='MD')
|
||||
searchnote = search_clip(books, '经济', 'ALL', 'TITLE')
|
||||
if searchnote[0] > 0: format_out(searchnote[1], 'searchtitle', ft='MD')
|
||||
searchnote = search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
|
||||
if searchnote[0] > 0: format_out(searchnote[1], 'searchauthor', ft='MD')
|
||||
|
||||
# add note content to highlight, then delete note
|
||||
add_note_to_highlight(books)
|
||||
|
||||
# test dict json convert
|
||||
with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
|
||||
fw.write(dict2json(books))
|
||||
if json2dict('./xx')==books: print( 'test OK')
|
||||
|
||||
# print data with json format
|
||||
logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
|
||||
format_out(books, OUTPREF, ft='MD')
|
||||
|
||||
# print data with json format
|
||||
logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user