#############################################
## PROGRAM: kman.py
## AUTHOR:  Chengan
## CREATE:  20200526
## douboer@gmail.com
#############################################
"""Read a Kindle ``My Clippings.txt`` file into a nested dict of books.

Resulting data structure (plain dicts, JSON-serialisable)::

    books = {
        "<bookname>": {
            "author": "<author>",
            "1": {
                "content": "...",
                "day": "2020年4月3日",
                "meridiem": "下午",
                "position": "311-311",
                "time": "3:00:53",
                "type": "HL",
                "week": "星期五"
            },
            "2": {...},
            ...
        },
        ...
    }

Section keys are the stringified running index of the successfully parsed
sections of the clippings file (the index is shared by all books).
``type`` is one of ``HL`` (highlight), ``NT`` (note) or ``BM`` (bookmark).
"""

import platform
import re
import json
import logging
from collections import defaultdict

# Short code for the host OS (kept for choosing an OS-specific CLIPPATH).
# platform.system() returns 'Windows' / 'Linux' / 'Darwin'; anything that is
# neither Windows nor Linux is reported as 'MAC', as before.
_PLATFORM_CODES = {'Windows': 'WIN', 'Linux': 'LINUX'}
SYS = _PLATFORM_CODES.get(platform.system(), 'MAC')

# some constants
LASTLINE = '=========='   # separator line that closes every section
NTPREF = '--CG注:'        # prefix put before a note merged into a highlight
CLIPPATH = './My Clippings.txt'  # /Volumes/Kindle/documents/My\ Clippings.txt
STAT = 'NONE'
DEBUG = 1     # 0 - INFO; 1 - DEBUG
LOG2FILE = 1  # 0 - to stdio; 1 - to file

# log info
logger = logging.getLogger()
# Alternative, more verbose format:
#   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('')

if LOG2FILE:
    # delay=True: the file "log" is only opened when the first record is
    # emitted, so merely importing this module does not create an empty file.
    handler = logging.FileHandler("log", delay=True)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

if DEBUG:
    logger.setLevel(logging.DEBUG)

# Title line: "<bookname> (<author>)", optionally preceded by a BOM.
# NOTE: the pattern is compiled with re.X, so the blank between the two groups
# is NOT matched literally; group 1 therefore still carries the whitespace
# that precedes "(" and parse_section() strips it.
au = re.compile(
    r'''
    ^\ufeff*
    (.+) \(      # group1 - bookname
    (.+)\)       # group2 - author
    ''', flags=re.X)

# Metadata line: position, clipping type, date, weekday, am/pm marker, time.
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                  # group1 - position: "514" or "311-311"
    .+
    (笔记|标注|书签)                  # group2 - type (note|highlight|bookmark)
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)      # group3 - date, yyyy年mm月dd日
    (星期.)                          # group4 - day of week
    \s
    (..)                            # group5 - am/pm marker
    (\d{1,2}:\d{1,2}:\d{1,2})       # group6 - time
    ''', flags=re.X)

# Clipping type as written in the metadata line -> short type code.
_TYPE_CODES = {'标注': 'HL', '笔记': 'NT', '书签': 'BM'}


def parse_section(sec, idx):
    """Parse the lines of one clipping section.

    Args:
        sec: list of the non-blank lines of the section, without the
            separator line: ``[title line, metadata line, content, ...]``.
            Content lines beyond the third are joined with a single space.
        idx: index of the section; ``str(idx)`` becomes its key.

    Returns:
        ``False`` when the section has fewer than three lines (bookmarks and
        highlights over pictures have no content line) or when the title or
        metadata line does not match the expected format.  Otherwise a
        ``defaultdict`` shaped like::

            {'bookname': bookname,
             bookname: {'author': author,
                        str(idx): {'type': 'HL' | 'NT' | 'BM',
                                   'position': '311-311',
                                   'day': '2020年5月26日',
                                   'week': '星期二',
                                   'meridiem': '下午',
                                   'time': '10:26:31',
                                   'content': content}}}
    """
    if len(sec) <= 2:
        return False

    title_match = au.search(sec[0])
    meta_match = da.search(sec[1])
    if title_match is None or meta_match is None:
        # Malformed section: report it like any other unusable section
        # instead of failing on a None match object.
        return False

    # Strip the blank that separates the title from "(author)".
    bookname = title_match.group(1).strip()
    author = title_match.group(2)
    position, kind, day, week, meridiem, time = meta_match.groups()

    section = defaultdict(dict)
    section['bookname'] = bookname
    section[bookname]['author'] = author
    section[bookname][str(idx)] = {
        'type': _TYPE_CODES[kind],
        'position': position,
        'day': day,
        'week': week,
        'meridiem': meridiem,
        'time': time,
        'content': ' '.join(sec[2:]),
    }
    return section


def formmat_out(books, f='MD'):
    """Format ``books`` for output (not implemented yet).

    Args:
        books: dict of parsed books.
        f: output format, one of 'MD', 'TXT', 'JSON'.
    """
    pass


def search_clip(books, s, t='ALL', p='ALL'):
    """Search the clippings (not implemented yet).

    Args:
        books: dict of parsed books.
        s: key word to search for.
        t: clipping type to search, one of 'ALL', 'HL', 'BM', 'NT'.
        p: searching scope, one of 'ALL', 'TITLE', 'AUTHOR', 'CONTENT'.
    """
    pass


def _collect_books(lines):
    """Group clipping lines into sections and build the ``books`` dict.

    ``lines`` is any iterable of text lines.  A section is closed by a line
    starting with LASTLINE; lines after the last separator are ignored.
    """
    books = defaultdict(dict)
    sec = []   # lines of the section being collected
    idx = 0    # number of non-blank lines seen in the current section
    sidx = 0   # index of the last successfully parsed section

    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        idx += 1

        if not line.startswith(LASTLINE):
            if idx > 3:
                # content spanning several lines is folded into one string
                sec[2] += ' ' + line
                logger.debug('idx %s %s', idx, sec[2])
            else:
                sec.append(line)
                logger.debug('idx %s %s', idx, sec[idx - 1])
            continue

        # separator reached: parse the collected section, then start over
        candidate = sidx + 1
        secd = parse_section(sec, candidate)
        sec = []
        idx = 0
        if not secd:
            # bookmark or malformed section: the index is not consumed
            continue

        sidx = candidate
        key = str(sidx)
        bn = secd['bookname']
        entry = secd[bn][key]
        books[bn]['author'] = secd[bn]['author']
        books[bn][key] = entry

        # A note directly following a highlight of the same book is also
        # appended to that highlight.  The previous index may be missing
        # (first section) or belong to another book, hence .get().
        previous = books[bn].get(str(sidx - 1))
        if (entry['type'] == 'NT' and previous is not None
                and previous['type'] == 'HL'):
            previous['content'] += NTPREF + entry['content']

    return books


def main():
    """Parse CLIPPATH and write the result to the debug log as JSON."""
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books = _collect_books(f)

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))


if __name__ == '__main__':
    main()