#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 2020/05/26
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import csv
import io
import json
import logging
import os
import platform
import re
import subprocess
import time
from collections import defaultdict

# Data structure used throughout this module (a dict of books):
#
# books = {
#     "bookname_xxx": {
#         "author": "李",
#         "section1636": {
#             "content": "张",
#             "day": "2020年4月3日",
#             "meridiem": "下午",
#             "position": "311-311",
#             "time": "3:00:53",
#             "type": "HL",
#             "week": "星期五"
#         },
#         "section1651": {
#             "content": "治",
#             "day": "2020年4月3日",
#             "meridiem": "下午",
#             "position": "514",
#             "time": "3:43:50",
#             "type": "NT",
#             "week": "星期五"
#         },
#         ...
#     },
#     ...
# }
#
# Besides the section entries, a book dict holds the bookkeeping keys
# "author" and (after drop_duplicate / search_clip) "lines".

# Host operating system tag. platform.system() reports 'Windows', 'Linux'
# or 'Darwin', so the comparison must use 'Linux' (not 'LINUX').
_SYSTEM = platform.system()
SYS = 'WIN' if _SYSTEM == 'Windows' else \
      ('LINUX' if _SYSTEM == 'Linux' else 'MAC')

# some constants
LASTLINE = '=========='          # separator line between clipping sections
NTPREF = '--CG注:'               # prefix put in front of a note merged into a highlight
CLIPFN = 'My Clippings.txt'
CLIPPATH = './'                  # e.g. /Volumes/Kindle/documents/
OUTPREF = './clip'
DEBUG = 1                        # 0 - INFO; 1 - DEBUG
LOG2FILE = 1                     # 0 - to stdio; 1 - to file
LOGFILE = 'log'
DELIMITER = '|'

# Keys of a book dict that are bookkeeping entries, not clipping sections.
_META_KEYS = ('author', 'lines')

# log info
logger = logging.getLogger()
formatter = logging.Formatter('')
if LOG2FILE:
    # Explicit encoding so that non-ASCII note text can always be logged.
    handler = logging.FileHandler(LOGFILE, encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

if DEBUG:
    logger.setLevel(logging.DEBUG)

# Title line of a section: "<bookname> (<author>)", optionally BOM-prefixed.
au = re.compile(
    r'''
    ^\ufeff*
    (.+) \(      # group1 - bookname
    (.+)\)       # group2 - author
    ''', flags=re.X)

# Meta line of a section: position, clipping type and timestamp.
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                    # group1 - position ("514" or "311-311")
    .+
    (笔记|标注|书签)                   # group2 - type (note|highlight|bookmark)
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)        # group3 - date
    (星期.)                           # group4 - week day
    \s
    (..)                              # group5 - am/pm marker
    (\d{1,2}:\d{1,2}:\d{1,2})         # group6 - time
    ''', flags=re.X)


class kMan:
    """Import, clean up, query and export Kindle clippings."""

    def __init__(self, parent=None):
        """Initialise counters and compute the initial status strings.

        Args:
            parent: accepted for interface compatibility; not used here.
        """
        self.hlnum = 0
        self.ntnum = 0
        self.refleshtime = '2020/10/10 10:00:00'
        self.status = self.status_info()

    def status_info(self):
        """Build the two status strings.

        Returns:
            [s1, s2] where s1 summarises the highlight/note counters and the
            last refresh time, and s2 tells whether a Kindle is connected.
        """
        s1 = u'Hightlight: {} Note: {} RefleshTime: {}'. \
            format(self.hlnum, self.ntnum, self.refleshtime)

        kp = self.get_kindle_path()
        if not kp:
            s2 = u'Disconnected ({})'.format(CLIPPATH+CLIPFN)
        else:
            # A missing or unreadable version file must not break start-up.
            try:
                with open(kp+'/system/version.txt', 'r',
                          encoding='utf8', errors='ignore') as f:
                    version = f.read().strip()
            except OSError:
                version = 'unknown'
            s2 = u'Connected ({}) version {}'.format(kp, version)

        return [s1, s2]

    def parse_section(self, s, i):
        """Parse one clipping section.

        Args:
            s: list of the non-blank lines of the section
               (title line, meta line, content)
            i: section index, used (as a string) as the section key

        Returns:
            False when the section has no content line (bookmark, highlight
            over a picture) or does not match the expected format, otherwise
            a dict like:
                {
                    'bookname': bookname,
                    bookname: {
                        'author': author,
                        '<i>': {
                            'type': 'HL' | 'NT' | 'BM',
                            'position': '123',
                            'day': '2020年5月26日',
                            'week': '星期二',
                            'meridiem': '下午',
                            'time': '10:26:31',
                            'content': content
                        }
                    }
                }
        """
        # Bookmarks and picture highlights only have two lines; anything
        # shorter is malformed.
        if len(s) <= 2:
            return False

        authinfo, dateinfo = s[0], s[1]
        content = s[2] if len(s) == 3 else None

        das = da.search(dateinfo)
        aus = au.search(authinfo)
        if das is None or aus is None:
            # Unknown layout: report it as "not a valid section" instead of
            # failing with AttributeError on the missing match object.
            return False

        pos, kind, day, week, pmam, tm = das.groups()
        tpy = {'标注': 'HL', '笔记': 'NT'}.get(kind, 'BM')

        bookname = aus.group(1).strip().replace(' ', '')
        author = aus.group(2).strip().replace(' ', '')

        section = defaultdict(dict)
        section[bookname]['author'] = author
        section['bookname'] = bookname
        section[bookname][str(i)] = {
            'type': tpy,
            'position': pos,
            'day': day,
            'week': week,
            'meridiem': pmam,
            'time': tm,
            'content': content
        }

        return section

    def format_time(self, ds):
        """Convert a clipping timestamp to 'Y/M/D H:MM:SS' (24-hour clock).

        Args:
            ds: e.g. '2020年1月13日 星期一 上午 8:11:05'
                (date, week day, am/pm marker and time separated by spaces)

        Returns:
            e.g. '2020/1/13 8:11:05'; with '下午' the same input gives
            '2020/1/13 20:11:05'
        """
        d = ds.split(' ')
        ymd = '/'.join(re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})', d[0]).groups())

        res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})', d[3])
        # 12-hour clock: 12 am is hour 0 and 12 pm stays 12, so reduce the
        # hour modulo 12 before adding the afternoon offset.
        hour = int(res.group(1)) % 12 + (0 if d[2] == '上午' else 12)

        return ymd + ' ' + str(hour) + res.group(2)

    def format_data(self, bks, ft='MD'):
        """Render the books dict as table rows.

        Args:
            bks: books dict
            ft: 'MD' for a markdown table; any other value gives CSV rows

        Returns:
            list of row strings: the header row, for markdown the '--'
            separator row, then one row per section
        """
        is_md = ft == 'MD'
        columns = ['TYPE', 'BOOKNAME', 'AUTHOR', 'MARKTIME', 'CONTENT']

        def join_row(fields):
            if is_md:
                return '|'.join(fields)
            # csv.writer quotes fields containing commas or quotes, which a
            # plain ','.join() would silently corrupt.
            buf = io.StringIO()
            csv.writer(buf, lineterminator='').writerow(fields)
            return buf.getvalue()

        rows = [join_row(columns)]
        if is_md:
            rows.append('|'.join(['--'] * len(columns)))

        for kb, vb in bks.items():
            author = vb['author']
            for ks, vs in vb.items():
                if ks in _META_KEYS:
                    continue
                stamp = self.format_time(' '.join(
                    [vs['day'], vs['week'], vs['meridiem'], vs['time']]))
                rows.append(join_row(
                    [vs['type'], kb, author, stamp, vs['content'] or '']))

        return rows

    def format_out(self, bks, fnpref, ft='MD'):
        """Write the books dict to '<fnpref><suffix>'.

        markdown format:
            TYPE|BOOKNAME|AUTHOR|MARKTIME|CONTENT
            --|--|--|--|--
            xx|xx|xx|xx|xx

        CSV format:
            TYPE,BOOKNAME,AUTHOR,MARKTIME,CONTENT
            xx,xx,xx,xx,xx

        Args:
            bks: books dict
            fnpref: output path without suffix
            ft: 'MD' / 'CSV' / 'JSON'

        Raises:
            KeyError: ft is not one of the supported formats
        """
        suff = {'MD': '.md', 'CSV': '.csv', 'JSON': '.json'}
        op = fnpref + suff[ft]

        with open(op, 'w', encoding='utf8', errors='ignore') as fw:
            if ft == 'JSON':
                fw.write(json.dumps(bks, indent=4, sort_keys=True,
                                    ensure_ascii=False))
            else:
                fw.writelines(row + '\n' for row in self.format_data(bks, ft))

    def drop_duplicate(self, bks):
        """Remove superseded sections, in place.

        Marking the same place a second time makes the Kindle store a second
        record. When two consecutive sections of one book have the same type
        and the content of one contains the content of the other, the earlier
        section is removed.

        Each book also gets a 'lines' entry holding the number of sections
        left after the clean-up.

        Args:
            bks: books dict

        Returns:
            the same books dict
        """
        for vb in bks.values():
            # The comparison state is per book: a section must never be
            # compared with (or removed because of) a section of another book.
            prev_key, prev = None, None
            # Iterate over a snapshot because entries are deleted in the loop.
            for ks, vs in list(vb.items()):
                if ks in _META_KEYS:
                    continue
                if prev is not None and prev['type'] == vs['type']:
                    cur_text, prev_text = vs['content'], prev['content']
                    if cur_text is not None and prev_text is not None and \
                            (cur_text in prev_text or prev_text in cur_text):
                        del vb[prev_key]
                prev_key, prev = ks, vs

            vb['lines'] = sum(1 for k in vb if k not in _META_KEYS)

        return bks

    def get_bookname_num(self, bks):
        """Count sections per book.

        Args:
            bks: books dict

        Returns:
            [total, {bookname: num, ...}]
        """
        bksnum = defaultdict(int)
        for kb, vb in bks.items():
            bksnum[kb] = sum(1 for ks in vb if ks not in _META_KEYS)

        return [sum(bksnum.values()), bksnum]

    def get_author_num(self, bks):
        """Count sections per author.

        Args:
            bks: books dict

        Returns:
            [total, {author: num, ...}]; authors whose books hold no section
            are not listed
        """
        bksnum = defaultdict(int)
        for vb in bks.values():
            count = sum(1 for ks in vb if ks not in _META_KEYS)
            if count:
                bksnum[vb['author']] += count

        return [sum(bksnum.values()), bksnum]

    def filter_clips(self, bks, info=None, tp=0):
        """Select sections by book name or author and flatten them to rows.

        Args:
            bks: books dict
            info: book name (tp=1) or author (tp=2) to keep
            tp: 0 - keep everything
                1 - keep the book named info
                2 - keep the books written by info

        Returns:
            list of rows:
            [[Type,Bookname,Author,Position,Date,Content], ...]
        """
        if tp == 0:
            selected = bks
        else:
            selected = {kb: vb for kb, vb in bks.items()
                        if (tp == 1 and info == kb) or
                           (tp == 2 and info == vb.get('author'))}

        seclist = []
        for kb, vb in selected.items():
            for ks, vs in vb.items():
                if ks in _META_KEYS:
                    continue
                tm = self.format_time(' '.join(
                    [vs['day'], vs['week'], vs['meridiem'], vs['time']]))
                nttype = '标注' if vs['type'] == 'HL' else '笔记'
                seclist.append(
                    [nttype, kb, vb['author'], vs['position'], tm, vs['content']])

        return seclist

    def add_note_to_highlight(self, bks):
        """Merge each note into the highlight right before it, in place.

        When a 'NT' section directly follows a 'HL' section of the same book,
        NTPREF plus the note content is appended to the highlight content and
        the note section is removed.

        Args:
            bks: books dict

        Returns:
            the same books dict
        """
        for vb in bks.values():
            # Reset per book: the first note of a book must not be attached
            # to the last highlight of the previous book.
            prev_key, prev = None, None
            for ks, vs in list(vb.items()):
                if ks in _META_KEYS:
                    continue
                if prev is not None and \
                        [prev['type'], vs['type']] == ['HL', 'NT']:
                    target = vb[prev_key]
                    target['content'] = (target['content'] or '') + \
                        NTPREF + (vs['content'] or '')
                    del vb[ks]
                    if 'lines' in vb:
                        vb['lines'] -= 1
                prev_key, prev = ks, vs

        return bks

    def search_clip(self, bks, s, t='ALL', p='ALL'):
        """Search the clippings.

        Args:
            bks: books dict
            s: key word; used as a regular expression, or as literal text
               when it is not a valid regular expression
            t: section type to search: 'ALL' 'HL' 'BM' 'NT'
            p: scope to search: 'ALL' 'TITLE' 'AUTHOR' 'CONTENT'

        Returns:
            [number of matching sections, result dict]; the result dict has
            the books-dict layout, only holds books with at least one match,
            and each book's 'lines' is its number of matching sections
        """
        try:
            pattern = re.compile(s)
        except re.error:
            pattern = re.compile(re.escape(s))

        nbks = defaultdict(dict)
        nu = 0
        for kb, vb in bks.items():
            author = vb.get('author', '')
            hits = {}
            for ks, vs in vb.items():
                if ks in _META_KEYS or t not in ('ALL', vs['type']):
                    continue
                content = vs['content'] or ''
                scopestr = {'ALL': ''.join([kb, author, content]),
                            'TITLE': kb, 'AUTHOR': author, 'CONTENT': content}
                if pattern.search(scopestr[p]):
                    hits[ks] = vs

            if hits:
                # 'lines' is computed from the matches; the source book's own
                # 'lines' value must not leak into the result.
                book = nbks[kb]
                book['lines'] = len(hits)
                book['author'] = author
                book.update(hits)
                nu += len(hits)

        return [nu, nbks]

    # to be implemented
    def statistic(self, bks):
        pass

    def dict2json(self, d):
        """Serialise a dict to a JSON string.

        Args:
            d: the dict

        Returns:
            JSON string
        """
        return json.dumps(d)

    def json2dict(self, jf):
        """Load a dict from a JSON file.

        Args:
            jf: path of the file holding the JSON string

        Returns:
            dict
        """
        with open(jf, 'r', encoding='utf8', errors='ignore') as f:
            return json.load(f)

    @staticmethod
    def _kindle_drive_from_wmic(text):
        """Return the drive name of the 'Kindle' row of wmic output, or False."""
        for row in text.splitlines():
            if 'Kindle' in row:
                # Columns are padded with blanks; the first one is the name.
                return row.split()[0]
        return False

    def get_kindle_path(self):
        """Check for a connected Kindle and return its path.

        Returns:
            the path string of the Kindle device when one is found
            ('/Volumes/Kindle/' on posix, the drive name on Windows),
            otherwise False
        """
        # not tested on windows & linux
        if os.name == 'nt':
            cmd = ['wmic', 'logicaldisk', 'get', 'name,volumename']
        elif os.name == 'posix':
            cmd = ['ls', '/Volumes/Kindle']
        else:
            return False

        try:
            with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE) as proc:
                # communicate() drains both pipes, so a chatty stderr cannot
                # block the child process.
                out, _ = proc.communicate()
        except OSError:
            return False

        sout = out.decode('utf-8', errors='ignore')
        if os.name == 'nt':
            return self._kindle_drive_from_wmic(sout)
        return '/Volumes/Kindle/' if sout else False

    def import_clips(self, fp=(CLIPPATH+CLIPFN)):
        """Import clippings from a clippings file.

        A section is a title line, a meta line and the content lines; it is
        closed by a '==========' line. Blank lines are ignored and extra
        content lines are joined with a single space. Sections rejected by
        parse_section (bookmarks, malformed sections) are skipped. The
        highlight/note counters and the refresh time are updated.

        Args:
            fp: path of the clippings file

        Returns:
            books dict
        """
        bks = defaultdict(dict)
        sidx = 0
        sec = []

        with open(fp, 'r', encoding='utf8', errors='ignore') as f:
            for raw in f:
                line = raw.strip()
                if not line:
                    continue

                if LASTLINE not in line:
                    if len(sec) >= 3:
                        # content spread over more than one line
                        sec[2] += ' ' + line
                    else:
                        sec.append(line)
                    continue

                # Separator reached: parse the collected section.
                sidx += 1
                secd = self.parse_section(sec, sidx)
                sec = []
                if not secd:
                    # bookmark or malformed section: give the index back
                    sidx -= 1
                    continue

                bn = secd['bookname']
                entry = secd[bn][str(sidx)]
                bks[bn]['author'] = secd[bn]['author']
                # Notes are not merged into highlights here: duplicates have
                # to be removed first (see drop_duplicate).
                bks[bn][str(sidx)] = entry

                if entry['type'] == 'HL':
                    self.hlnum += 1
                elif entry['type'] == 'NT':
                    self.ntnum += 1

        self.refleshtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        return bks


if __name__ == '__main__':
    km = kMan()
    books = km.import_clips()

    # remove duplication
    km.drop_duplicate(books)

    # test search note function
    searchnote = km.search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
    if searchnote[0] > 0:
        km.format_out(searchnote[1], 'searchcontent', ft='MD')
    searchnote = km.search_clip(books, '经济', 'ALL', 'TITLE')
    if searchnote[0] > 0:
        km.format_out(searchnote[1], 'searchtitle', ft='MD')
    searchnote = km.search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
    if searchnote[0] > 0:
        km.format_out(searchnote[1], 'searchauthor', ft='MD')

    print(km.get_bookname_num(books))
    print(km.get_author_num(books))

    # add note content to highlight, then delete note
    km.add_note_to_highlight(books)

    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(km.dict2json(books))
    if km.json2dict('./xx') == books:
        print('test OK')

    km.format_out(books, OUTPREF, ft='MD')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))