#############################################
## PROGRAM: file2.py
## AUTHOR: Chengan 20200526
## douboer@gmail.com
#############################################
"""Parse a Kindle ``My Clippings.txt`` export into a nested dictionary.

Each section of the clippings file has the form::

    <book title> (<author>)
    - <metadata: position, type, date, weekday, AM/PM marker, time>
    <blank line>
    <content line(s)>
    ==========

Resulting data structure (entries are keyed by the 1-based line number of
the section's metadata line)::

    books = {
        'bookname1': {
            'author': 'chen',
            'HL': {                        # highlights
                12: {
                    'position': '123-145',
                    'day': '2020年5月26日',
                    'week': '星期二',
                    'meridiem': '下午',     # raw AM/PM token from the file
                    'time': '10:26:31',
                    'content': 'xxxx',
                },
                ...
            },
            'NT': {                        # notes, same fields as 'HL'
                ...
            },
            # bookmarks ('BM') are skipped because their content is empty
        },
        'bookname2': ...
    }
"""

import re
import json
from collections import defaultdict

# some constants
BOUNDARY = '==========\n'
# MACOS - /Volumes/Kindle/documents/My\ Clippings.txt
CLIPFILE = './My Clippings.txt'

# Parser states:
#   NONE  - nothing matched yet; the next non-blank line is a section title
#   START - title line seen; the next non-blank line is the metadata line
#   BM    - section is a bookmark (skipped)
#   HL    - section is a highlight
#   NT    - section is a note
#   SKIP  - metadata line not recognised; ignore lines up to the separator
STAT = 'NONE'

# Maps the type word found in the metadata line to a parser state.
_SECTION_KINDS = {'标注': 'HL', '笔记': 'NT', '书签': 'BM'}

# Byte-order mark that Kindle may write in front of a title line.
# str.strip() does not remove it, so it is handled explicitly.
_BOM = '\ufeff'

# author & bookname info
# The BOM prefix is optional so that title lines without it are recognised.
au = re.compile(
    r'''
    ^\ufeff*
    (.+)
    \(        #bookname
    (.+)\)    #author
    ''',
    flags=re.X
)

# page & date info
# sample line:
#   您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31
# re.X (VERBOSE): the pattern may span several lines, whitespace inside it
# is ignored and comments are allowed.
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                  #group1 - position; a single digit is valid
    .+
    (笔记|标注|书签)                #group2 - type (note|highlight|bookmark)
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)     #group3 - xxxx年xx月xx日
    (星期.)                         #group4 - week
    \s
    (..)                            #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})       #group6 - time
    ''',
    flags=re.X
)


def _split_title(line):
    """Return ``(bookname, author)`` parsed from a section title line.

    When the line has no ``(author)`` part, the whole line (without the
    BOM) is used as the book name and the author is the empty string.
    """
    aus = au.search(line)
    if aus:
        # group(1) keeps the blank that precedes '(' - strip it so the
        # dictionary key is the bare title.
        return aus.group(1).strip(), aus.group(2).strip()
    return line.lstrip(_BOM).strip(), ''


def parse_clippings(lines, verbose=False):
    """Parse the lines of a Kindle clippings file.

    Args:
        lines: iterable of text lines (an open text file works).
        verbose: when true, print each book and each parsed metadata line.

    Returns:
        A ``defaultdict(dict)`` with the structure described in the module
        docstring.  Bookmarks and sections whose metadata line is not
        recognised are left out.
    """
    books = defaultdict(dict)
    separator = BOUNDARY.strip()
    stat = STAT
    bookname = ''
    entry = None      # entry of the current section that receives content
    hl_entry = None   # most recent highlight entry that has content
    hl_book = None    # book that hl_entry belongs to
    hl_base = ''      # content of hl_entry before the current note is added

    for idx, raw in enumerate(lines, start=1):
        line = raw.strip()
        if not line:
            # str.isspace() is False for '', so test for emptiness instead.
            continue

        if line == separator:
            stat = 'NONE'
            entry = None
            continue

        # A section starts right after a separator (or at the top of the
        # file); a BOM-prefixed line also starts one.
        if stat == 'NONE' or line.startswith(_BOM):
            bookname, author = _split_title(line)
            books[bookname]['author'] = author
            stat = 'START'
            entry = None
            if verbose:
                print("book:", bookname, "auth:", author)
            continue

        if stat == 'START':
            das = da.search(line)
            if das is None:
                stat = 'SKIP'
                continue
            pos, kind, day, week, pmam, clock = das.groups()
            stat = _SECTION_KINDS[kind]
            # skip bookmark
            if stat == 'BM':
                continue
            entry = {
                'position': pos,
                'day': day,
                'week': week,
                'meridiem': pmam,
                'time': clock,
            }
            # Add to the per-type dict instead of replacing it, so that
            # earlier sections of the same book are kept.
            books[bookname].setdefault(stat, {})[idx] = entry
            if stat == 'NT' and hl_entry is not None:
                # Successive notes stack on top of what is already there.
                hl_base = hl_entry['content']
            if verbose:
                print(pos, stat, day, week, 'PM' if pmam == "下午" else 'AM')
            continue

        if entry is None:
            # Bookmark or unrecognised section: nothing to store.
            continue

        # Content line; multi-line content is joined with newlines.
        if 'content' in entry:
            entry['content'] += '\n' + line
        else:
            entry['content'] = line

        if stat == 'HL':
            # Remember the highlight so that a following note can be
            # appended to its content.
            hl_entry = entry
            hl_book = bookname
            hl_base = entry['content']
        elif stat == 'NT' and hl_entry is not None and hl_book == bookname:
            # Append the note to the previous highlight of the same book.
            hl_entry['content'] = hl_base + '(CG注:' + entry['content']

    return books


def main():
    """Read CLIPFILE, print what is parsed and return the result."""
    with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
        books = parse_clippings(f, verbose=True)
    # To dump the result as JSON:
    # print(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
    return books


if __name__ == '__main__':
    main()