# Reconstructed from a whitespace-flattened `git diff` that adds kread.py as a
# new file; the diff markers are dropped so the module is plain Python again.

#############################################
## PROGRAM: kman.py
## AUTHOR:  Chengan
## CREATE:  20200526
## douboer@gmail.com
#############################################
"""Parse a Kindle ``My Clippings.txt`` export into a nested dict.

Resulting data structure (section keys are the 1-based section index,
rendered with ``str()``)::

    books = {
        "bookname_xxx": {
            "author": "...",
            "1636": {
                "content": "...",
                "day": "2020年4月3日",
                "meridiem": "下午",
                "position": "311-311",
                "time": "3:00:53",
                "type": "HL",          # HL highlight / NT note / BM bookmark
                "week": "星期五"
            },
            ...
        },
        ...
    }
"""

import platform
import re
import json
import logging
from collections import defaultdict

# Host OS tag, meant for choosing a clippings path per platform.
# platform.system() reports 'Linux' (not 'LINUX'), so compare against that.
_SYSTEM = platform.system()
SYS = 'WIN' if _SYSTEM == 'Windows' else \
      ('LINUX' if _SYSTEM == 'Linux' else 'MAC')

# some constants
LASTLINE = '=========='             # separator line that closes every section
NTPREF   = '--CG注:'                # prefix put in front of a note merged into a highlight
CLIPPATH = './My Clippings.txt'     # /Volumes/Kindle/documents/My\ Clippings.txt
STAT     = 'NONE'
DEBUG    = 1  # 0 - INFO; 1 - DEBUG
LOG2FILE = 1  # 0 - to stdio; 1 - to file

# log info
logger = logging.getLogger()
# More verbose alternative kept from the original for reference:
# formatter = logging.Formatter
#   ('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
formatter = logging.Formatter('')

if LOG2FILE:
    handler = logging.FileHandler("log")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

if DEBUG:
    logger.setLevel(logging.DEBUG)

# author & bookname info
# NOTE: re.X ignores literal whitespace inside the pattern, so the separator
# between title and "(" must be spelled as \s*; "(.*\S)" keeps the title from
# ending in a blank.
au = re.compile(
r'''
^\ufeff*     # optional byte-order mark(s) in front of the title
(.*\S)       # group1 - bookname (greedy: the last "(...)" is the author)
\s*\(
(.+)\)       # group2 - author
''', flags=re.X)

# page & date info
# Earlier draft of this pattern, apparently in vim regex syntax (kept for reference):
#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                # group1 - position: "514" or "311-311" (one digit is enough)
.+
(笔记|标注|书签)               # group2 - type: note / highlight / bookmark
.+
(\d{4}年\d{1,2}月\d{1,2}日)   # group3 - date
(星期.)                       # group4 - week
\s
(..)                          # group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})     # group6 - time
''', flags=re.X)

# Clipping type word (as matched by group2 of `da`) -> short type code.
# Anything not listed here is reported as 'BM'.
_TYPE_CODES = {'标注': 'HL', '笔记': 'NT'}


def parse_section(sec, idx):
    """Parse one clippings section into a small nested dict.

    Args:
        sec: list of the section's non-blank lines: ``sec[0]`` is the
            "title (author)" line, ``sec[1]`` the position/type/date line and
            ``sec[2:]`` the content (several content lines are joined with a
            single space).
        idx: section index; ``str(idx)`` is used as the key of the entry.

    Returns:
        ``False`` when the section cannot be used: fewer than three lines
        (bookmarks and highlights over pictures have no content line), or a
        title/metadata line that does not match ``au`` / ``da``.
        Otherwise a ``defaultdict`` shaped like::

            {'bookname': bookname,
             bookname: {'author': author,
                        str(idx): {'type': 'HL' | 'NT' | 'BM',
                                   'position': '311-311',
                                   'day': '2020年5月26日',
                                   'week': '星期二',
                                   'meridiem': '下午',
                                   'time': '10:26:31',
                                   'content': content}}}
    """
    # 1. highlight over a picture: the content (#3 line) is empty, only two lines
    # 2. bookmark section: only two lines
    # 3. anything shorter is not a valid section at all
    if len(sec) <= 2:
        return False

    aus = au.search(sec[0])   # parse #1 line
    das = da.search(sec[1])   # parse #2 line
    if aus is None or das is None:
        # Treat an unparseable header like any other malformed section
        # instead of failing on None.group().
        return False

    bookname = aus.group(1)
    section = defaultdict(dict)
    section[bookname]['author'] = aus.group(2)
    section['bookname'] = bookname
    section[bookname][str(idx)] = {
        'type': _TYPE_CODES.get(das.group(2), 'BM'),
        'position': das.group(1),
        'day': das.group(3),
        'week': das.group(4),
        'meridiem': das.group(5),
        'time': das.group(6),
        # keep every content line, not only the case of exactly three lines
        'content': ' '.join(sec[2:]),
    }

    return section


def formmat_out(books, f='MD'):
    """Format ``books`` for output (not implemented yet; returns ``None``).

    Args:
        books: dict in the layout described in the module docstring.
        f: output format - 'MD', 'TXT' or 'JSON'.
    """
    pass


def search_clip(books, s, t='ALL', p='ALL'):
    """Search clippings by title/author/content (not implemented yet; returns ``None``).

    Args:
        books: dict in the layout described in the module docstring.
        s: key word.
        t: clipping type filter - 'ALL', 'HL', 'BM' or 'NT'.
        p: search scope - 'ALL', 'TITLE', 'AUTHOR' or 'CONTENT'.
    """
    pass


def parse_clippings(lines):
    """Build the ``books`` dict from an iterable of clippings lines.

    Blank lines are skipped. The remaining lines of a section are collected
    until a line containing ``LASTLINE`` is seen; a fourth and later line is
    appended to the content line with a single space. Sections rejected by
    ``parse_section`` do not consume a section index. A note ('NT') whose
    immediately preceding section index belongs to the same book and is a
    highlight ('HL') is also appended to that highlight's content, prefixed
    with ``NTPREF``.

    Args:
        lines: iterable of text lines, e.g. an open text file.

    Returns:
        ``defaultdict(dict)`` keyed by book name (see module docstring).
    """
    books = defaultdict(dict)
    sec = []     # non-blank lines of the section being collected
    sidx = 0     # index of the last accepted section
    idx = 0      # non-blank line counter inside the current section

    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        idx += 1

        if LASTLINE not in line:
            if len(sec) < 3:
                sec.append(line)
            else:
                # content more than 1 line
                sec[2] += ' ' + line
            logger.debug('idx %s %s', idx, sec[-1])
            continue

        # separator reached: parse the collected section and reset the buffer
        idx = 0
        sidx += 1
        secd = parse_section(sec, sidx)
        sec = []

        if not secd:
            # BM or not correct format section
            sidx -= 1
            continue

        bn = secd['bookname']
        key = str(sidx)
        entry = secd[bn][key]
        books[bn]['author'] = secd[bn]['author']
        books[bn][key] = entry

        if entry['type'] == 'NT':
            # The previous index may be missing for this book (first section,
            # or the previous section belongs to another book) - use .get().
            prev = books[bn].get(str(sidx - 1))
            if prev is not None and prev['type'] == 'HL':
                prev['content'] += NTPREF + entry['content']

    return books


if __name__ == '__main__':
    # each section is a few lines terminated by a '==========' line
    STAT = 'START'   # not read anywhere in this file
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books = parse_clippings(f)

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))