175 lines
5.3 KiB
Python
175 lines
5.3 KiB
Python
|
|
#############################################
|
|
## PROGRAM: file2.py
|
|
## AUTHOR: Chengan 20200526
|
|
## douboer@gmail.com
|
|
#############################################
|
|
|
|
import re
|
|
import json
|
|
from collections import defaultdict
|
|
|
|
# Module-level constants.

# Section separator line (kept with its trailing newline).
BOUNDARY = "==========\n"

# Clippings file to parse.
# On macOS the device copy is at /Volumes/Kindle/documents/My\ Clippings.txt
CLIPFILE = "./My Clippings.txt"

# Parser state; 'NONE' means nothing has been matched yet.
STAT = "NONE"
|
|
|
|
'''
|
|
STAT :
|
|
NONE - match nothing
|
|
START - start line of section
|
|
BM - section is a bookmark
|
|
HL - section is a highlight
|
|
NT - section is a note
|
|
'''
|
|
# data structure
|
|
'''
|
|
book = {'bookname1':
|
|
{ 'author':'chen',
|
|
'HL':
|
|
{ 'index1':
|
|
{
|
|
'position':'123-145',
|
|
'content':'xxxx',
|
|
'day':'2020年5月26日',
|
|
'week':'星期二',
|
|
'meridiem':'PM',
|
|
'time':'10:26:31'
|
|
},
|
|
'index2':
|
|
{
|
|
...
|
|
},
|
|
...
|
|
},
|
|
# 'BM': xxx skip bookmark because the content is empty
|
|
# { 'index1':
|
|
# {
|
|
# 'position':'123',
|
|
# 'day':'2020年5月26日',
|
|
# 'week':'星期二',
|
|
# 'meridiem':'PM',
|
|
# 'time':'10:26:31'
|
|
# },
|
|
# 'index2':
|
|
# {
|
|
# ...
|
|
# },
|
|
# ...
|
|
# }
|
|
'NT':
|
|
{ 'index1':
|
|
{
|
|
'position':'123',
|
|
'content':'xxxx',
|
|
'day':'2020年5月26日',
|
|
'week':'星期二',
|
|
'meridiem':'PM',
|
|
'time':'10:26:31'
|
|
},
|
|
'index2':
|
|
{
|
|
...
|
|
},
|
|
...
|
|
}
|
|
},
|
|
'bookname2':
|
|
...
|
|
}
|
|
'''
|
|
|
|
# Title line of a section: one or more U+FEFF (BOM) characters followed by
# "<book name> (<author>)".
#   group 1 - book name, without the blanks that precede the parenthesis
#   group 2 - author, everything up to the last closing parenthesis
# NOTE: under re.X an unescaped space in the pattern is ignored, so the blank
# between the name and the parenthesis has to be matched explicitly (\s*);
# otherwise it ends up inside group 1.
au = re.compile(
    r'''
    ^\ufeff+    # leading BOM character(s)
    (.*\S)      # group1 - bookname, greedy, must end on a non-blank
    \s*\(       # optional blanks, then the parenthesis that opens the author
    (.+)\)      # group2 - author
    ''', flags=re.X)
|
|
|
|
# Position & date line of a section, e.g.
#   您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31
# re.X (VERBOSE): the pattern may span several lines, whitespace inside it is
# ignored and comments are allowed.
# Basic-regex (Vim-style) draft this pattern was derived from:
#   \(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
# The position is either a single number or a "start-end" range; the range
# part is optional so that single-digit positions such as "#5" match too.
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)              #group1 - position, "N" or "N-M"
    .+
    (笔记|标注|书签)              #group2 - type: note / highlight / bookmark
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)  #group3 - xxxx年xx月xx日
    (星期.)                      #group4 - week
    \s
    (..)                        #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})   #group6 - time
    ''', flags=re.X)
|
|
|
|
# Parse the clippings file into `books`:
#   books[bookname]['author']    -> author string
#   books[bookname]['HL'][index] -> {'position', 'day', 'week', 'meridiem',
#                                    'time', 'content'}
#   books[bookname]['NT'][index] -> same keys as 'HL'
# `index` is the 1-based line number of the section's position/date line.
# Bookmarks are skipped because they carry no content.
with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
    books = defaultdict(dict)
    bookname = ''
    author = ''
    idx = 0           # 1-based number of the line being processed
    num_section = 0   # currently unused
    num_line = 0      # currently unused
    sec_idx = None    # idx of the position/date line of the current section
    bk_book = None    # book that owns the most recent highlight section
    bk_idx = None     # sec_idx of the most recent highlight section
    # Lines are stripped before comparison, so compare against the stripped
    # separator as well.
    boundary = BOUNDARY.strip()

    # Iterate the file lazily instead of loading every line up front.
    for line in f:
        idx += 1
        line = line.strip()
        # After strip() a blank line is '', and ''.isspace() is False, so
        # test for emptiness directly.
        if not line:
            continue

        # End of section: reset the state so that the separator itself is
        # never stored as highlight/note content.
        if line == boundary:
            STAT = 'NONE'
            continue

        # Title line: "<bookname> (<author>)".
        aus = au.search(line)
        if aus:
            STAT = 'START'
            # Drop the blank that separates the name from the parenthesis.
            bookname = aus.group(1).strip()
            author = aus.group(2)
            books[bookname]['author'] = author
            print("book:", bookname, "auth:", author)
            continue

        # Position/date line: decides the type of the current section.
        das = da.search(line)
        if das:
            STAT = ('HL' if das.group(2) == '标注'
                    else ('NT' if das.group(2) == '笔记' else 'BM'))

            # skip bookmark
            if STAT == 'BM':
                continue

            pos = das.group(1)
            day = das.group(3)
            week = das.group(4)
            pmam = das.group(5)
            time = das.group(6)

            # Add one entry per section; earlier entries of the same type
            # must be kept, so never replace the whole per-type dict.
            sec_idx = idx
            books[bookname].setdefault(STAT, {})[sec_idx] = {
                'position': pos,
                'day': day,
                'week': week,
                'meridiem': pmam,
                'time': time,
            }
            if STAT == 'HL':
                # Remember the highlight so a following note can be
                # appended to it.
                bk_book = bookname
                bk_idx = sec_idx

            print(pos, STAT, day, week, 'PM' if pmam == "下午" else 'AM')
            continue

        # Anything else is content of the current section.
        if STAT == 'HL':
            entry = books[bookname]['HL'][sec_idx]
            if 'content' in entry:
                # Content spanning several lines is joined, not overwritten.
                entry['content'] = entry['content'] + '\n' + line
            else:
                entry['content'] = line
        elif STAT == 'NT':
            note = books[bookname]['NT'][sec_idx]
            first_line = 'content' not in note
            if first_line:
                note['content'] = line
            else:
                note['content'] = note['content'] + '\n' + line
            # If the section is a note, also append it to the previous
            # highlight - but only when there is one and it belongs to the
            # same book.
            if bk_idx is not None and bk_book == bookname:
                hl = books[bookname]['HL'][bk_idx]
                hl['content'] = (hl.get('content', '')
                                 + ('(CG注:' if first_line else '\n')
                                 + line)
        # STAT 'NONE' / 'START' / 'BM': nothing to record for this line.
|
|
|
|
#print(json.dumps(books,indent=4,sort_keys=True,ensure_ascii=False))
|