#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 20200526
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import re
import json
import logging
import platform
from collections import defaultdict

# data structure - use dict
'''
books =
{
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
}
'''

# modify clippath for different os
# platform.system() returns 'Windows', 'Linux' or 'Darwin' (case-sensitive),
# anything that is neither Windows nor Linux is treated as MAC.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')

# some constants
LASTLINE = '=========='                 # separator line closing every section
NTPREF = '--CG注:'                       # prefix put before a note merged into a highlight
CLIPPATH = './My Clippings.txt'         # /Volumes/Kindle/documents/My\ Clippings.txt
OUTPREF = './clip'                      # output file path without suffix

DEBUG = 1       # 0 - INFO; 1 - DEBUG
LOG2FILE = 1    # 0 - to stdio; 1 - to file

# log info
logger = logging.getLogger()
# alternative, more verbose format:
#   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('')
if LOG2FILE:
    handler = logging.FileHandler("log")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

if DEBUG:
    logger.setLevel(logging.DEBUG)

# author & bookname info (first line of a section, optional leading BOM)
au = re.compile(
    r'''
    ^\ufeff*
    (.+)
    \(      #bookname
    (.+)\)  #author
    ''',
    flags=re.X
)

# page & date info (second line of a section)
# the position is one number or a 'start-end' range; single-digit
# positions such as '#5' are accepted as well
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)              #group1 - page
    .+
    (笔记|标注|书签)               #group2 - type
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)   #group3 - xxxx年xx月xx日
    (星期.)                      #group4 - week
    \s
    (..)                        #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})   #group6 - time
    ''',
    flags=re.X
)


def parse_section(s, i):
    """Parse one clipping section.

    Args:
        s: list of the non-blank lines of the section:
           s[0] bookname/author line, s[1] position/date line,
           s[2:] content line(s)
        i: section index, str(i) is used as key of the section entry

    Returns:
        False when the section has less than 3 lines or when one of the
        two header lines does not match the expected format, otherwise
        a dict like this:
        d = { 'bookname':bookname,
              bookname: {
                'author':author
                str(i):{
                    'type':'HL',
                    'position':'123',
                    'day':'2020年5月26日',
                    'week':'星期二',
                    'meridiem':'PM',
                    'time':'10:26:31'
                    'content':content }}}
    """
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if len(s) <= 2:
        return False

    authinfo = s[0]
    dateinfo = s[1]
    # content spread over several lines is joined with a blank
    content = ' '.join(s[2:])

    # parse #2 line
    das = da.search(dateinfo)
    # parse #1 line
    aus = au.search(authinfo)
    if das is None or aus is None:
        # not a well-formed section, let the caller skip it
        logger.debug('skip malformed section %s: %r', i, s[:2])
        return False

    # type of section
    #   BM - section is a bookmark
    #   HL - section is a highlight
    #   NT - section is a note
    kinds = {'标注': 'HL', '笔记': 'NT'}
    tpy = kinds.get(das.group(2), 'BM')

    bookname = aus.group(1)
    author = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(i)] = {
        'type': tpy,
        'position': das.group(1),
        'day': das.group(3),
        'week': das.group(4),
        'meridiem': das.group(5),
        'time': das.group(6),
        'content': content
    }

    return section


def format_out(bks, ft='MD'):
    """Format the books dict and write it to file OUTPREF + suffix.

    MARKDOWN format (not implemented yet, an empty file is written):
        TYPE | bookname | author | marktime | content
        --|--|--|--|--
        xx|xx|xx|xx|xx

    CSV format (not implemented yet, plain json is written instead):
        TYPE,bookname,author,marktime,content
        xx,xx,xx,xx,xx

    marktime: 20200403 PM 3:0:3 星期五

    Args:
        bks: books dict
        ft: can be 'MD'/'JSON'/'CSV'

    Raises:
        KeyError: ft is not one of the supported formats
    """
    suff = {'MD': '.md', 'CSV': '.csv', 'JSON': '.json'}
    op = OUTPREF + suff[ft]

    # NOTE: characters that cannot be encoded in gbk are dropped silently
    with open(op, 'w', encoding='gbk', errors='ignore') as fw:
        if ft == 'JSON':
            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
        elif ft == 'MD':
            pass
        else:
            fw.write(json.dumps(bks))  # only for load back


def statistic(bks):
    pass


def dict2json(d):
    """Convert dict to json.

    Args:
        d: the dict

    Return:
        json string
    """
    return json.dumps(d)


def json2dict(jf):
    """Load a json file back into a dict.

    Args:
        jf: path of the file holding the json string

    Return:
        dict
    """
    with open(jf, 'r', encoding='utf8', errors='ignore') as f:
        return json.load(f)


def search_clip(bks, s, t='ALL', p='ALL'):
    """Search clip, searching scope may be title/author/content.

    Args:
        bks: books dict
        s: key word
        t: 'ALL' 'HL' 'BM' 'NT'
        p: 'ALL' 'TITLE' 'AUTHOR' 'CONTENT'

    Return:
        search clipping content
    """
    pass


if __name__ == '__main__':
    # 4 lines for each section separated with '=========='
    # so read 4 lines before '=========='
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books = defaultdict(dict)
        secd = defaultdict(dict)
        sidx = 0    # index of the current valid section
        idx = 0     # index of the non-blank line inside the current section
        sec = []    # [bookname/author line, position/date line, content]
        for line in f:
            line = line.strip()
            if re.match(r'^\s*$', line):
                continue
            idx += 1

            if not re.search(LASTLINE, line):
                # content more than 1 line
                if idx > 3:
                    sec[2] += str(' ' + line)
                    logger.debug('idx %s %s', idx, sec[2])
                else:
                    sec.append(line)
                    logger.debug('idx %s %s', idx, sec[idx - 1])
            else:
                idx = 0
                sidx += 1

                # parsing section & fill data structure
                secd = parse_section(sec, sidx)

                if secd:
                    bn = secd['bookname']
                    tpy = secd[bn][str(sidx)]['type']

                    books[bn]['author'] = secd[bn]['author']
                    books[bn][str(sidx)] = secd[bn][str(sidx)]

                    # a note following a highlight of the same book is
                    # appended to that highlight; the previous index may
                    # be missing (first entry of the book, or the previous
                    # section belongs to another book)
                    prev = books[bn].get(str(sidx - 1))
                    if tpy == 'NT' and prev is not None and prev['type'] == 'HL':
                        prev['content'] += str(NTPREF + sec[2])

                else:  # BM or not correct format section
                    sidx -= 1

                # initial section for next section loop
                sec = []

    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict('./xx') == books:
        print('test OK')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))