#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 2020/05/26
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################
"""Parse, clean up, search and export Kindle clippings.

All functions work on one nested dict ("books dict").  Each book title maps
to a dict holding the author, an optional section counter and one entry per
clipping, keyed by the stringified running section index::

    books = {
        "bookname_xxx": {
            "author": "李",
            "1": {
                "content": "张",
                "day": "2020年4月3日",
                "meridiem": "下午",
                "position": "311-311",
                "time": "3:00:53",
                "type": "HL",          # HL highlight / NT note / BM bookmark
                "week": "星期五"
            },
            "2": {...},
            "lines": 2                 # added by drop_duplicate/search_clip
        },
        ...
    }
"""

import csv
import io
import json
import logging
import platform
import re
from collections import defaultdict

# Short tag for the running OS.  platform.system() reports 'Windows',
# 'Linux' or 'Darwin'; anything that is not Windows/Linux is tagged 'MAC'.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')

# some constants
LASTLINE = '=========='      # separator line that closes every section
NTPREF   = '--CG注:'         # prefix put in front of a note merged into a highlight
#CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
CLIPPATH = './tclip.txt'
OUTPREF  = './clip'
DEBUG    = 1  # 0 - INFO; 1 - DEBUG
LOG2FILE = 1  # 0 - to stdio; 1 - to file
DELIMITER = '|'

# Keys of a book entry that are metadata rather than clipping sections.
_META_KEYS = ('author', 'lines')

# Kindle's wording for a clipping kind -> short type code.
_TYPE_CODES = {'标注': 'HL', '笔记': 'NT', '书签': 'BM'}

# Meridiem markers meaning "before noon"; every other marker is treated as PM.
_AM_MARKERS = ('上午', 'AM')

# log info
logger = logging.getLogger()
#formatter = logging.Formatter
#   ('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
formatter = logging.Formatter('')

if LOG2FILE:
    handler = logging.FileHandler("log")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

if DEBUG:
    logger.setLevel(logging.DEBUG)

# author & bookname info: "<bookname>(<author>)", optionally preceded by BOMs
au = re.compile(
    r'''
    ^\ufeff*
    (.+)
    \(                              #bookname
    (.+)\)                          #author
    ''',
    flags=re.X
)

# page & date info
da = re.compile(
    r'''
    \# (\d+(?:-\d+)?)               #group1 - position, "N" or "N-M"
    .+
    (笔记|标注|书签)                #group2 - type
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)     #group3 - year/month/day
    (星期.)                         #group4 - week
    \s
    (..)                            #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})       #group6 - time
    ''',
    flags=re.X
)

# Pre-compiled helpers for format_time().
_YMD = re.compile(r'(\d{4}).(\d{1,2}).(\d{1,2})')
_HMS = re.compile(r'(\d{1,2})(:\d{1,2}:\d{1,2})')


def parse_section(s, i):
    """Parse the lines of one clipping section.

    Args:
        s: list of section lines - [title line, info line, content line(s)]
        i: section index; str(i) becomes the key of the parsed section

    Returns:
        False when the section has fewer than three lines (bookmarks and
        highlights over pictures have no content line) or when the info
        line does not match the expected pattern.  Otherwise a defaultdict:
        d = { 'bookname': bookname,
              bookname: {
                  'author': author,
                  str(i): {
                      'type': 'HL',
                      'position': '123',
                      'day': '2020年5月26日',
                      'week': '星期二',
                      'meridiem': '下午',
                      'time': '10:26:31',
                      'content': content }}}
    """
    if len(s) <= 2:
        return False

    authinfo, dateinfo = s[0], s[1]
    # Callers normally pass exactly three lines; fold extra ones into content.
    content = ' '.join(s[2:])

    das = da.search(dateinfo)
    if das is None:
        # Unknown layout: skip the section rather than abort the whole import.
        logger.warning('skip section %s, unrecognized info line: %r', i, dateinfo)
        return False
    pos, kind, day, week, pmam, time = das.groups()

    aus = au.search(authinfo)
    if aus:
        bookname = aus.group(1).strip()
        author = aus.group(2).strip()
    else:
        # Title line without "(author)": keep the title, leave author empty.
        bookname = authinfo.lstrip('\ufeff').strip()
        author = ''

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(i)] = {
        'type': _TYPE_CODES[kind],
        'position': pos,
        'day': day,
        'week': week,
        'meridiem': pmam,
        'time': time,
        'content': content
    }

    return section


def format_time(ds):
    """Convert a clipping timestamp to 'Y/M/D H:MM:SS' on a 24-hour clock.

    Args:
        ds: space separated 'day week meridiem time',
            e.g. '2020年1月13日 星期一 下午 8:11:05'

    Return:
        e.g. '2020/1/13 20:11:05'.  12 o'clock is handled as on a 12-hour
        clock: '上午 12:05:00' -> '0:05:00', '下午 12:30:00' -> '12:30:00'.
    """
    d = ds.split(' ')
    ymd = '/'.join(_YMD.search(d[0]).groups())

    res = _HMS.search(d[3])
    hour = int(res.group(1))
    if hour == 12:
        hour = 0                    # 12 AM is hour 0, 12 PM is 0 + 12
    if d[2] not in _AM_MARKERS and hour < 12:
        hour += 12

    return ymd + ' ' + str(hour) + res.group(2)


def _md_cell(text):
    """Escape the column separator so the text stays inside one table cell."""
    return text.replace('|', '\\|')


def _csv_row(fields):
    """Render one CSV record (no line terminator), quoting where required."""
    buf = io.StringIO()
    csv.writer(buf, lineterminator='').writerow(fields)
    return buf.getvalue()


def format_data(bks, ft='MD'):
    """Format the books dict as Markdown table rows or CSV records.

    Args:
        bks: books dict
        ft: 'MD' for a Markdown table; any other value yields CSV

    Return:
        list of strings: the header line(s) followed by one line per section
    """
    titles = ['TYPE', 'BOOKNAME', 'AUTHOR', 'MARKTIME', 'CONTENT']
    if ft == 'MD':
        lines = ['|'.join(titles), '|'.join(['--'] * len(titles))]
    else:
        lines = [_csv_row(titles)]

    for kb, vb in bks.items():
        author = vb['author'] or ''
        for ks, vs in vb.items():
            if ks in _META_KEYS:
                continue
            marktime = format_time(
                ' '.join([vs['day'], vs['week'], vs['meridiem'], vs['time']]))
            # content may be None/null, e.g. in data loaded back from JSON
            fields = [vs['type'], kb, author, marktime, vs['content'] or '']
            if ft == 'MD':
                lines.append('|'.join(
                    [fields[0], _md_cell(kb), _md_cell(author),
                     marktime, _md_cell(fields[4])]))
            else:
                lines.append(_csv_row(fields))

    return lines


def format_out(bks, fnpref, ft='MD'):
    """Write the books dict to the file '<fnpref><suffix>'.

    markdown format:
    TYPE | bookname | author | marktime | content
    --|--|--|--|--
    xx|xx|xx|xx|xx

    CSV format:
    TYPE,bookname,author,marktime,content
    xx,xx,xx,xx,xx

    Args:
        bks: books dict
        fnpref: output path without suffix
        ft: 'MD' / 'JSON' / 'CSV'; selects suffix .md / .json / .csv

    Returns:
        None

    Raises:
        KeyError: ft is not one of 'MD', 'CSV', 'JSON'
    """
    suff = {'MD': '.md', 'CSV': '.csv', 'JSON': '.json'}
    op = fnpref + suff[ft]

    with open(op, 'w', encoding='utf8', errors='ignore') as fw:
        if ft == 'JSON':
            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
        else:
            fw.writelines(row + '\n' for row in format_data(bks, ft))


def _is_duplicate(prev, cur):
    """True if two sections share a type and one content contains the other."""
    a, b = prev['content'], cur['content']
    if prev['type'] != cur['type'] or a is None or b is None:
        return False
    return a in b or b in a


def drop_duplicate(bks):
    """Drop duplicated sections, in place.

    Marking the same place twice makes the Kindle store two records.  When
    two consecutive sections of a book have the same type and the content
    of one contains the other, the earlier one is removed.  Each book's
    'lines' entry is set to the number of sections it keeps.

    Args:
        bks: books dict

    Return:
        the same books dict with the duplicates removed
    """
    for vb in bks.values():
        # The "previous section" is tracked per book: section keys are
        # unique across books, so a key from another book cannot be popped.
        prev_key, prev = None, None
        kept = 0
        # iterate over a snapshot because sections are popped on the way
        for ks, vs in list(vb.items()):
            if ks in _META_KEYS:
                continue
            kept += 1
            if prev is not None and _is_duplicate(prev, vs):
                vb.pop(prev_key)
                kept -= 1
            prev_key, prev = ks, vs
        vb['lines'] = kept

    return bks


def add_note_to_highlight(bks):
    """Append each note to the highlight right before it, in place.

    A 'NT' section that directly follows an 'HL' section of the same book
    is appended to that highlight's content (prefixed with NTPREF) and then
    removed.  An existing 'lines' counter is decremented accordingly.

    Args:
        bks: books dict

    Return:
        the same, changed books dict
    """
    for vb in bks.values():
        prev_key, prev = None, None        # tracked per book, see drop_duplicate
        for ks, vs in list(vb.items()):
            if ks in _META_KEYS:
                continue
            if prev is not None and prev['type'] == 'HL' and vs['type'] == 'NT':
                vb[prev_key]['content'] = \
                    (prev['content'] or '') + NTPREF + (vs['content'] or '')
                vb.pop(ks)
                if 'lines' in vb:
                    vb['lines'] -= 1
            prev_key, prev = ks, vs

    return bks


def search_clip(bks, s, t='ALL', p='ALL'):
    """Search the clippings; the scope may be title, author and/or content.

    Args:
        bks: books dict
        s: key word; it is used as a regular expression (re.search)
        t: section type to look at - 'ALL' 'HL' 'BM' 'NT'
        p: text to search in - 'ALL' 'TITLE' 'AUTHOR' 'CONTENT'

    Return:
        [number of matching sections, books dict of the matches].  The
        result holds only books with at least one match; each carries its
        'author', the matching sections and 'lines' (its match count).

    Raises:
        KeyError: p is not one of the four scopes (raised once a section
            of the requested type is examined)
    """
    pattern = re.compile(s)
    nbks = defaultdict(dict)
    nu = 0

    for kb, vb in bks.items():
        author = vb.get('author') or ''
        hits = {}
        for ks, vs in vb.items():
            if ks in _META_KEYS:
                continue
            if t not in ('ALL', vs['type']):
                continue
            content = vs['content'] or ''
            scopestr = {'ALL': ''.join([kb, author, content]),
                        'TITLE': kb, 'AUTHOR': author, 'CONTENT': content}
            if pattern.search(scopestr[p]):
                hits[ks] = vs

        if hits:
            found = nbks[kb]
            if 'author' in vb:
                found['author'] = vb['author']
            found.update(hits)
            # counted from scratch; the source book's own 'lines' is not copied
            found['lines'] = len(hits)
            nu += len(hits)

    return [nu, nbks]


# to be implemented
def statistic(bks):
    """Placeholder for statistics over the books dict; does nothing yet."""
    pass


def dict2json(d):
    """Convert a dict to a json string.

    Args:
        d: the dict

    Return:
        json string
    """
    return json.dumps(d)


def json2dict(jf):
    """Load a dict from a file holding a json string.

    Args:
        jf: path of the file that stores the json string

    Return:
        the decoded dict
    """
    with open(jf, 'r', encoding='utf8', errors='ignore') as f:
        return json.load(f)


def import_clips(path=None):
    """Read a clippings file into a books dict.

    A section consists of a title line, an info line and one or more
    content lines, and is closed by a line containing LASTLINE.  Blank
    lines are ignored and several content lines are joined with a space.
    Sections that parse_section() rejects are skipped and do not consume
    a section index.

    Args:
        path: clippings file to read; CLIPPATH when omitted

    Return:
        books dict (a defaultdict) with sections keyed '1', '2', ...
    """
    clip_path = CLIPPATH if path is None else path
    bks = defaultdict(dict)
    sidx = 0      # index of the last section that was stored
    sec = []      # non-blank lines of the section being collected

    with open(clip_path, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            if LASTLINE not in line:
                if len(sec) < 3:
                    sec.append(line)
                else:
                    # content more than 1 line
                    sec[2] += ' ' + line
                continue

            # separator reached: parse what was collected, then start over
            secd = parse_section(sec, sidx + 1)
            sec = []
            if not secd:
                continue        # BM or not correct format section

            sidx += 1
            bn = secd['bookname']
            bks[bn]['author'] = secd[bn]['author']
            # Notes are not merged into highlights here: NT records may be
            # duplicated, so duplicates have to be dropped first.
            bks[bn][str(sidx)] = secd[bn][str(sidx)]

    return bks


if __name__=='__main__':
    books = import_clips()

    # remove duplication
    drop_duplicate(books)

    # test search note function
    searchnote = search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
    if searchnote[0] > 0: format_out(searchnote[1], 'searchcontent', ft='MD')
    searchnote = search_clip(books, '经济', 'ALL', 'TITLE')
    if searchnote[0] > 0: format_out(searchnote[1], 'searchtitle', ft='MD')
    searchnote = search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
    if searchnote[0] > 0: format_out(searchnote[1], 'searchauthor', ft='MD')

    # add note content to highlight, then delete note
    add_note_to_highlight(books)

    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict('./xx')==books: print( 'test OK')

    format_out(books, OUTPREF, ft='MD')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))