Files
kman/kman.py
2020-05-28 21:10:46 +08:00

298 lines
7.8 KiB
Python

#########################################################
## @file : kman.py
## @desc : kindle note management tool
## @create : 20200526
## @author : Chengan
## @email : douboer@gmail.com
#########################################################
import re
import json
import logging
import platform
from collections import defaultdict
# data structure - use dict
'''
books =
{
"bookname_xxx": {
"author": "",
"section1636": {
"content": "",
"day": "2020年4月3日",
"meridiem": "下午",
"position": "311-311",
"time": "3:00:53",
"type": "HL",
"week": "星期五"
},
"section1651": {
"content": "",
"day": "2020年4月3日",
"meridiem": "下午",
"position": "514",
"time": "3:43:50",
"type": "NT",
"week": "星期五"
},
"section1814": {
"content": null,
"day": "2020年4月12日",
"meridiem": "下午",
"position": "5186",
"time": "2:20:12",
"type": "BM",
"week": "星期日"
},
...
},
...
}
'''
# Short tag for the host OS: 'WIN', 'LINUX', or 'MAC' for anything else.
# platform.system() reports 'Linux' (not 'LINUX'), so the lookup keys use the
# exact spellings that function returns.
# NOTE(review): presumably meant for choosing an OS-specific clippings path.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')
# some constants
# separator line that closes every clipping section in the clippings file
LASTLINE = '=========='
# prefix put in front of a note's text when it is appended to the preceding highlight
NTPREF = '--CG注:'
# input file read by the __main__ script
CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
# output path prefix; format_out() appends '.md' / '.csv' / '.json'
OUTPREF = './clip'
DEBUG = 1 # 0 - INFO; 1 - DEBUG
LOG2FILE = 1 # 0 - to stdio; 1 - to file
# log info
logger = logging.getLogger()
# Message-only format (what logging.Formatter('') falls back to).
# A more verbose alternative:
#   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('%(message)s')
if LOG2FILE:
    # Explicit utf8: the logged clippings contain Chinese text, which the
    # platform's default encoding may not be able to represent.
    handler = logging.FileHandler("log", encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# Set the level on both paths; otherwise the file logger would stay at the
# root default (WARNING) when DEBUG is 0, contradicting "0 - INFO".
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
# author & bookname info, shaped like "bookname (author)"
# NOTE: with re.X, unescaped whitespace inside the pattern is ignored, so the
# blank in front of the author parenthesis has to be matched explicitly (\s*);
# otherwise it ends up at the tail of the captured book name.
au = re.compile(
    r'''
    ^\ufeff*        # optional BOM at the very start of the file
    (.*\S)          # group1 - bookname (greedy: keeps inner parentheses)
    \s*\(
    (.+)\)          # group2 - author (content of the last parentheses)
    ''', flags=re.X )

# position & date info, shaped like
#   "#311-311 ... 标注 ... 2020年4月3日星期五 下午3:00:53"
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                  # group1 - position, "514" or "311-311"
    .+
    (笔记|标注|书签)                # group2 - type: note / highlight / bookmark
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)     # group3 - date, year/month/day
    (星期.)                         # group4 - day of week
    \s
    (..)                            # group5 - am/pm marker (two characters)
    (\d{1,2}:\d{1,2}:\d{1,2})       # group6 - time h:m:s
    ''', flags=re.X )
def parse_section(s,i):
    """Parse one clipping section into a nested dict.

    Args:
        s: list of the section's lines:
           s[0] - "bookname (author)" line
           s[1] - position / type / date line
           s[2:] - content line(s)
        i: section index; str(i) is used as the key of the parsed entry

    Returns:
        False when the section has fewer than three lines (a bookmark, or a
        highlight over a picture, carries no content line) or when one of
        the first two lines does not match the expected layout.
        Otherwise a defaultdict like this:
        d = { 'bookname':bookname,
              bookname: {
                  'author':author,
                  str(i):{
                      'type':'HL',
                      'position':'123',
                      'day':'2020年5月26日',
                      'week':'星期二',
                      'meridiem':'PM',
                      'time':'10:26:31',
                      'content':content }}}
        where 'type' is 'HL' (highlight), 'NT' (note) or 'BM' (bookmark).
    """
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if not s or len(s)<=2:
        return False

    # Read from the parameter, not from a same-named global of the caller.
    authinfo = s[0]
    dateinfo = s[1]
    # Extra content lines (if any) are folded into one string.
    content = ' '.join(s[2:])

    # parse #2 line; a line that does not match means a malformed section
    das = da.search(dateinfo)
    # parse #1 line
    aus = au.search(authinfo)
    if das is None or aus is None:
        return False

    # type of section: anything that is neither highlight nor note is a bookmark
    tpy = {'标注':'HL', '笔记':'NT'}.get(das.group(2), 'BM')

    bookname = aus.group(1)
    author = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(i)] = {
        'type':tpy,
        'position':das.group(1),
        'day':das.group(3),
        'week':das.group(4),
        'meridiem':das.group(5),
        'time':das.group(6),
        'content':content }
    return section
def format_out(bks, ft='MD'):
    """Write the books dict to OUTPREF + extension in the requested format.

    Planned layouts (only JSON is fully implemented at the moment):
    MARKDOWN format:
        TYPE | bookname | author | marktime | content
        --|--|--|--|--
        xx|xx|xx|xx|xx
    CSV format:
        TYPE,bookname,author,marktime,content
        xx,xx,xx,xx,xx
    marktime: 20200403 PM 3:0:3 星期五

    Args:
        bks: books dict
        ft: can be 'MD'/'JSON'/'CSV'
            'JSON' - pretty-printed, key-sorted JSON, non-ASCII kept as is
            'MD'   - not implemented yet, an empty file is produced
            'CSV'  - currently a compact JSON dump (only for load back)
    Returns: None; the result is written to the output file
    Raises: KeyError if ft is not one of the supported formats
    """
    suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
    op = OUTPREF+suff[ft]
    # NOTE(review): 'gbk' + errors='ignore' silently drops characters that gbk
    # cannot encode - confirm this encoding is really wanted for the output.
    with open(op, 'w', encoding='gbk', errors='ignore') as fw:
        if ft=='JSON':
            # write through the file handle (fw); ft is only the format name
            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
        elif ft=='MD':
            pass  # TODO: markdown table output
        else:
            fw.write(json.dumps(bks)) # only for load back
def statistic(bks):
    """Placeholder for statistics over the books dict.

    Args: bks is the books dict (currently unused)
    Return: None - not implemented yet
    """
    return None
def dict2json(d):
    """Serialize a dict to a JSON string.

    Args: d is the dict to serialize
    Return: compact JSON text produced by json.dumps with default settings
            (non-ASCII characters are escaped)
    """
    return json.dumps(d)
def json2dict(jf):
    """Load a JSON file back into a Python object.

    Args: jf is the path of a file holding a JSON string
    Return: the decoded object (a dict for files written from a books dict)
    """
    # utf8 with undecodable bytes skipped, then parsed in one go
    with open(jf, 'r', encoding='utf8', errors='ignore') as fh:
        return json.load(fh)
def search_clip(bks, s, t='ALL', p='ALL'):
    """Search the clippings; the scope may be title/author/content.

    Not implemented yet - the arguments are accepted and ignored.

    Args:
        bks: books dict
        s: key word
        t: section type filter - 'ALL' / 'HL' / 'BM' / 'NT'
        p: search scope - 'ALL' / 'TITLE' / 'AUTHOR' / 'CONTENT'
    Return: None for now (intended: the matching clipping content)
    """
    return None
if __name__=='__main__':
    # Each section is: title line, position/date line, content line(s),
    # closed by a '==========' separator line. Lines are collected into `sec`
    # until the separator is reached, then the section is parsed.
    books = defaultdict(dict)
    sidx = 0    # index of the last accepted section
    idx = 0     # count of non-blank lines seen in the current section
    sec = []
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            idx += 1

            if LASTLINE not in line:
                if idx>3:
                    # content more than 1 line: fold it into the content slot
                    sec[2] += ' '+line
                    logger.debug('idx {} {}'.format(idx, sec[2]))
                else:
                    sec.append(line)
                    logger.debug('idx {} {}'.format(idx, sec[idx-1]))
                continue

            # separator reached: parse the section & fill the data structure
            idx = 0
            sidx += 1
            secd = parse_section(sec,sidx)
            if secd:
                key = str(sidx)
                bn = secd['bookname']
                entry = secd[bn][key]
                books[bn]['author'] = secd[bn]['author']
                books[bn][key] = entry
                # A note directly following a highlight of the same book is
                # also appended to that highlight. .get() avoids a KeyError
                # when there is no previous section in this book.
                prev = books[bn].get(str(sidx-1))
                if entry['type']=='NT' and prev is not None and prev['type']=='HL':
                    prev['content'] += NTPREF+sec[2]
            else: # BM or not correct format section
                sidx -= 1

            # initial section for next section loop
            sec = []

    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict('./xx')==books: print( 'test OK')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))