#############################################
## PROGRAM: file2.py
## AUTHOR:  Chengan 20200526
##          douboer@gmail.com
#############################################
"""Parse a Kindle ``My Clippings.txt`` file into a nested dictionary.

The clippings file is a sequence of sections separated by a line of ten
``=`` characters.  The parser recognises three kinds of lines:

* a title line, matched by ``au`` (``<bookname> (<author>)``, starting with
  a U+FEFF byte-order mark),
* a metadata line, matched by ``da`` (position, clipping type, date, time),
* anything else, which is treated as the text of the current clipping.

Parser states (the initial one is the module constant ``STAT``):

    NONE  - match nothing
    START - start line of section
    BM    - section is a bookmark
    HL    - section is a highlight
    NT    - section is a note

Resulting data structure (``<line>`` is the 1-based line number of the
metadata line of the clipping):

    books = {'bookname1':
                {'author': 'chen',
                 'HL': {<line>: {'position': '123-145',
                                 'content': 'xxxx',
                                 'day': '2020年5月26日',
                                 'week': '星期二',
                                 'meridiem': 'PM',
                                 'time': '10:26:31'},
                        ...},
                 'NT': {<line>: {'position': '123',
                                 'content': 'xxxx',
                                 'day': '2020年5月26日',
                                 'week': '星期二',
                                 'meridiem': 'PM',
                                 'time': '10:26:31'},
                        ...}},
             'bookname2': ...}

Bookmarks ('BM') are skipped because their content is empty.
"""

import re
import json
from collections import defaultdict

# some constants
BOUNDARY = '==========\n'
# MACOS - /Volumes/Kindle/documents/My\ Clippings.txt
CLIPFILE = './My Clippings.txt'
STAT = 'NONE'

# author & bookname info
# re.X (VERBOSE): the pattern may span several lines, unescaped whitespace
# is ignored and comments are allowed.  Because of that, the blank before
# "\(" is NOT part of the pattern, so group 1 keeps the trailing space of
# the title; the parser strips it.
au = re.compile(
r'''
^\ufeff+
(.+) \(      # group1 - bookname
(.+)\)       # group2 - author
''', flags=re.X)

# page & date info, e.g.
# 您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                  # group1 - position, single value or range
.+
(笔记|标注|书签)                 # group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)      # group3 - xxxx年xx月xx日
(星期.)                          # group4 - week
\s
(..)                            # group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})       # group6 - time
''', flags=re.X)

# clipping type as written in the file -> parser state
_TYPE_TO_STAT = {'标注': 'HL', '笔记': 'NT', '书签': 'BM'}


def parse_clippings(lines):
    """Build the ``books`` dictionary from an iterable of clipping lines.

    Args:
        lines: iterable of text lines (an open file object works); trailing
            newlines are allowed.

    Returns:
        A ``defaultdict(dict)`` laid out as described in the module
        docstring.  Every highlight and note gets its own entry holding
        position, date fields and content.

    Side effects:
        Prints one line per recognised title and per recognised
        highlight/note metadata line.
    """
    books = defaultdict(dict)
    boundary = BOUNDARY.strip()
    stat = STAT
    bookname = ''
    entry = None     # entry of the clipping whose content is being read
    last_hl = None   # (bookname, entry) of the most recent highlight

    for idx, raw in enumerate(lines, start=1):
        line = raw.strip()

        # After strip() a blank line is '' (and ''.isspace() is False),
        # so test for emptiness directly.
        if not line:
            continue

        # The separator closes the section; it is never clipping text.
        if line == boundary:
            stat = 'NONE'
            entry = None
            continue

        title = au.search(line)
        if title:
            stat = 'START'
            entry = None
            bookname = title.group(1).strip()
            author = title.group(2)
            books[bookname]['author'] = author
            print("book:", bookname, "auth:", author)
            continue

        meta = da.search(line)
        if meta:
            stat = _TYPE_TO_STAT[meta.group(2)]
            entry = None

            # skip bookmark
            if stat == 'BM':
                continue

            pos, _, day, week, pmam, clock = meta.groups()
            meridiem = 'PM' if pmam == "下午" else 'AM'
            # Build the whole entry once and insert it under its own key,
            # so earlier fields and earlier clippings are not overwritten.
            entry = {
                'position': pos,
                'content': '',
                'day': day,
                'week': week,
                'meridiem': meridiem,
                'time': clock,
            }
            books[bookname].setdefault(stat, {})[idx] = entry
            if stat == 'HL':
                # remember the highlight so a following note can be
                # appended to it
                last_hl = (bookname, entry)

            print(pos, stat, day, week, meridiem)
            continue

        # Plain text: only meaningful inside a highlight or note section.
        if entry is None:
            continue

        entry['content'] = (entry['content'] + '\n' + line
                            if entry['content'] else line)

        # if the section is note,
        # append the note to the previous highlight content
        if stat == 'NT' and last_hl is not None and last_hl[0] == bookname:
            # NOTE(review): the marker has no closing ')' in the original
            # code; it is kept unchanged here - confirm the intended format.
            last_hl[1]['content'] = last_hl[1]['content'] + '(CG注:' + line

    return books


try:
    with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
        books = parse_clippings(f)
except FileNotFoundError:
    # Report the missing input instead of failing with a traceback.
    print("clippings file not found:", CLIPFILE)
    books = defaultdict(dict)

# To inspect the result:
# print(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))