#############################################
## PROGRAM: file2.py
## AUTHOR: Chengan
## CREATE: 20200526
## douboer@gmail.com
#############################################
"""Parse a Kindle ``My Clippings.txt`` export into a nested dict of books.

Resulting data structure (section keys are the stringified running index
of the stored sections)::

    books = {
        "<bookname>": {
            "author": "<author>",
            "1": {
                "content": "<text>",
                "day": "2020年4月3日",
                "meridiem": "下午",
                "position": "311-311",
                "time": "3:00:53",
                "type": "HL",          # 'HL' highlight, 'NT' note, 'BM' bookmark
                "week": "星期五"
            },
            "2": {...},
            ...
        },
        ...
    }

Sections with two lines or fewer (bookmarks, highlights over pictures) are
skipped and do not consume an index.
"""
import json
import logging
import platform
import re
from collections import defaultdict

# Host OS tag, intended for choosing the clippings path per platform.
# platform.system() returns 'Windows' / 'Linux' / 'Darwin', so the comparison
# must use 'Linux' (not 'LINUX'); anything else is treated as MAC.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')

# some constants
LASTLINE = '=========='             # separator line closing every section
NTPREF = '--CG注:'                   # prefix put before a note merged into a highlight
CLIPPATH = './My Clippings.txt'     # /Volumes/Kindle/documents/My\ Clippings.txt
STAT = 'NONE'
DEBUG = 1                           # 0 - INFO; 1 - DEBUG
LOG2FILE = 1                        # 0 - to stdio; 1 - to file

# Section type codes keyed by the type word found in the metadata line;
# any other matched word (i.e. the bookmark word) maps to 'BM'.
_TYPE_CODES = {'标注': 'HL', '笔记': 'NT'}

# log info
logger = logging.getLogger()
# A richer alternative format would be:
# '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('')
if LOG2FILE:
    # The logged text is Chinese; write it as UTF-8 regardless of the locale.
    handler = logging.FileHandler("log", encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
if DEBUG:
    logger.setLevel(logging.DEBUG)

# author & bookname info (first line of a section), optional leading BOM
au = re.compile(
    r'''
    ^\ufeff*
    (.+) \(     #group1 - bookname
    (.+)\)      #group2 - author
    ''',
    flags=re.X
)

# position & date info (second line of a section), e.g.
# "- 您在位置 #311-311的标注 | 添加于 2020年4月3日星期五 下午3:00:53"
da = re.compile(
    r'''
    \# (\d+(?:-\d+)?)                   #group1 - position, "N" or "N-M"
    .+ (笔记|标注|书签)                   #group2 - type
    .+ (\d{4}年\d{1,2}月\d{1,2}日)       #group3 - xxxx年xx月xx日
    (星期.)                              #group4 - week
    \s
    (..)                                #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})           #group6 - time
    ''',
    flags=re.X
)


def parse_section(sec, idx):
    """Parse one clipping section into a nested dict.

    Args:
        sec: list of the section's non-blank lines:
            ``[title-and-author line, metadata line, content]``.
        idx: index of the section; ``str(idx)`` becomes the section key.

    Returns:
        ``False`` when the section has two lines or fewer (bookmark,
        highlight over a picture, malformed) or when the title/metadata
        lines do not match the expected patterns. Otherwise a
        ``defaultdict`` shaped like::

            {
                'bookname': bookname,
                bookname: {
                    'author': author,
                    str(idx): {
                        'type': 'HL' | 'NT' | 'BM',
                        'position': ..., 'day': ..., 'week': ...,
                        'meridiem': ..., 'time': ..., 'content': ...,
                    },
                },
            }

        ``content`` is ``sec[2]`` when the section has exactly three
        lines, else ``None``.
    """
    if len(sec) <= 2:
        return False

    authinfo, dateinfo = sec[0], sec[1]
    content = sec[2] if len(sec) == 3 else None

    das = da.search(dateinfo)
    aus = au.search(authinfo)
    # Treat unparsable lines like any other malformed section instead of
    # crashing on `.group()` of None.
    if das is None or aus is None:
        return False

    bookname = aus.group(1)
    author = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(idx)] = {
        'type': _TYPE_CODES.get(das.group(2), 'BM'),
        'position': das.group(1),
        'day': das.group(3),
        'week': das.group(4),
        'meridiem': das.group(5),
        'time': das.group(6),
        'content': content,
    }
    return section


def parse_clippings(lines):
    """Build the ``books`` dict from an iterable of clippings-file lines.

    Blank lines are ignored. Lines up to the next separator (a line
    containing ``LASTLINE``) form one section; content lines beyond the
    third are joined onto the content with a single space. Each section
    is handed to ``parse_section``; rejected sections are dropped and do
    not consume an index. When a note ('NT') directly follows a highlight
    ('HL') of the same book, the note text is also appended to that
    highlight's content, prefixed with ``NTPREF``.

    Args:
        lines: iterable of text lines (e.g. an open text file).

    Returns:
        ``defaultdict(dict)`` mapping bookname to its author and sections.
    """
    books = defaultdict(dict)
    sidx = 0    # index of the last stored section
    idx = 0     # count of non-blank lines seen in the current section
    sec = []

    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        idx += 1

        if LASTLINE not in line:
            if idx > 3:
                # content more than 1 line
                sec[2] += ' ' + line
            else:
                sec.append(line)
            logger.debug('idx %s %s', idx, sec[-1])
            continue

        # Separator reached: parse the collected section and start a new one.
        finished, sec, idx = sec, [], 0
        sidx += 1
        secd = parse_section(finished, sidx)
        if not secd:
            # BM or not correct format section
            sidx -= 1
            continue

        bn = secd['bookname']
        key = str(sidx)
        entry = secd[bn][key]
        books[bn]['author'] = secd[bn]['author']
        books[bn][key] = entry

        if entry['type'] == 'NT':
            # The previous index may be missing (first section, or it
            # belongs to another book), so look it up defensively.
            prev = books[bn].get(str(sidx - 1))
            if prev and prev['type'] == 'HL':
                prev['content'] = (prev['content'] or '') + NTPREF + finished[2]

    return books


def formmat_out(books, f='MD'):
    """Format ``books`` for output (not implemented yet; returns None).

    Args:
        books: dict of parsed books.
        f: output format - 'MD', 'TXT' or 'JSON'.
    """


def search_clip(books, s, t='ALL', p='ALL'):
    """Search the clippings (not implemented yet; returns None).

    Args:
        books: dict of parsed books.
        s: key word.
        t: section type to search - 'ALL', 'HL', 'BM' or 'NT'.
        p: searching scope - 'ALL', 'TITLE', 'AUTHOR' or 'CONTENT'.
    """


if __name__ == '__main__':
    STAT = 'START'
    # each section is a few lines terminated by a '==========' line
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books = parse_clippings(f)

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))