kindle manager

This commit is contained in:
gavin
2020-05-28 13:24:40 +08:00
parent 12a7d01ad4
commit 855d436db1

174
file2.py
View File

@@ -1,174 +0,0 @@
#############################################
## PROGRAM: file2.py
## AUTHOR: Chengan 20200526
## douboer@gmail.com
#############################################
import re
import json
from collections import defaultdict
# ---- module-level constants and parser state ----

# Ten '=' characters followed by a newline.
# NOTE(review): presumably the separator line between clipping sections - confirm.
BOUNDARY = '=' * 10 + '\n'

# Clippings file to parse.
# On macOS the device copy is at /Volumes/Kindle/documents/My\ Clippings.txt
CLIPFILE = './My Clippings.txt'

# STAT holds the current parser state; possible values:
#   NONE  - match nothing
#   START - start line of section
#   BM    - section is a bookmark
#   HL    - section is a highlight
#   NT    - section is a note
STAT = 'NONE'
# data structure
'''
book = {
    'bookname1': {
        'author': 'chen',
        'HL': {
            'index1': {
                'position': '123-145',
                'content': 'xxxx',
                'day': '2020年5月26日',
                'week': '星期二',
                'meridiem': 'PM',
                'time': '10:26:31'
            },
            'index2': {
                ...
            },
            ...
        },
        # 'BM' is not stored: bookmarks are skipped because their content is empty
        # 'BM': {
        #     'index1': {
        #         'position': '123',
        #         'day': '2020年5月26日',
        #         'week': '星期二',
        #         'meridiem': 'PM',
        #         'time': '10:26:31'
        #     },
        #     'index2': {
        #         ...
        #     },
        #     ...
        # },
        'NT': {
            'index1': {
                'position': '123',
                'content': 'xxxx',
                'day': '2020年5月26日',
                'week': '星期二',
                'meridiem': 'PM',
                'time': '10:26:31'
            },
            'index2': {
                ...
            },
            ...
        }
    },
    'bookname2':
        ...
}
'''
# Author & bookname info: matches a section title line "<bookname> (<author>)".
#
# The pattern requires at least one leading U+FEFF (BOM) character, so a line
# that does not start with a BOM is never treated as a title line.
#
# In re.X (VERBOSE) mode literal whitespace inside the pattern is ignored, so
# the separator between bookname and "(" has to be spelled as \s*. Group 1
# ends on a non-space character so the bookname carries no trailing
# whitespace. Both groups stay greedy: for "Book (Series) (Author)" the last
# parenthesised part is taken as the author.
au = re.compile(
    r'''
    ^\ufeff+
    (.*\S)      #group1 - bookname, without trailing whitespace
    \s*\(
    (.+)\)      #group2 - author
    ''', flags=re.X)
# Position & date info of a section, for example:
#   您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31
#
# re.X (VERBOSE) lets the pattern span several lines, ignores whitespace
# inside it and allows inline comments.
#
# Groups:
#   1 - position after '#', a single number ("4286") or a range ("123-145")
#   2 - section type: 笔记 (note), 标注 (highlight) or 书签 (bookmark)
#   3 - date in the form xxxx年xx月xx日
#   4 - day of week (星期 followed by one character)
#   5 - two-character am/pm marker (上午 / 下午)
#   6 - time as h:m:s
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                #group1 - position, single number or range
    .+
    (笔记|标注|书签)                #group2 - type
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)    #group3 - year/month/day
    (星期.)                        #group4 - week
    \s
    (..)                          #group5 - pm/am
    (\d{1,2}:\d{1,2}:\d{1,2})     #group6 - time
    ''', flags=re.X)
# Parse the clippings file line by line and collect every section in `books`:
#
#   books[bookname]['author']   -> author taken from the title line
#   books[bookname]['HL'][idx]  -> highlight entry
#   books[bookname]['NT'][idx]  -> note entry
#
# `idx` is the 1-based number of the section's position/date line in the file.
# Each entry holds 'position', 'day', 'week', 'meridiem', 'time' and, once the
# text line(s) have been read, 'content'. Bookmarks are skipped.
#
# A line is classified in this order: blank -> boundary -> title (`au`) ->
# position/date (`da`) -> content of the current section.
with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
    books = defaultdict(dict)
    bookname = ''
    author = ''
    idx = 0
    num_section = 0    # not used yet
    num_line = 0       # not used yet

    # lines are stripped before comparison, so compare with the stripped boundary
    boundary = BOUNDARY.strip()
    current = None        # entry dict of the section being read (HL or NT)
    last_hl = None        # entry dict of the most recent highlight
    last_hl_book = None   # book that `last_hl` belongs to

    # iterate the file lazily instead of loading every line with readlines()
    for line in f:
        idx += 1
        line = line.strip()

        # After strip() a blank line is '' and ''.isspace() is False,
        # so test for emptiness directly.
        if not line:
            continue

        # End of a section: reset the state so the boundary itself is never
        # stored as content.
        if line == boundary:
            STAT = 'NONE'
            current = None
            continue

        # title line: bookname & author
        aus = au.search(line)
        if aus:
            STAT = 'START'
            bookname = aus.group(1)
            author = aus.group(2)
            books[bookname]['author'] = author
            print("book:", aus.group(1), "auth:", aus.group(2))
            continue

        # position & date line: decides the section type
        das = da.search(line)
        if das:
            STAT = ('HL' if das.group(2) == '标注'
                    else ('NT' if das.group(2) == '笔记' else 'BM'))
            # skip bookmark
            if STAT == 'BM':
                current = None
                continue
            pos = das.group(1)
            day = das.group(3)
            week = das.group(4)
            pmam = das.group(5)
            time = das.group(6)
            # Build the entry once and add it under its own index; assigning
            # books[bookname][STAT] = {...} field by field would replace the
            # whole dict every time and keep only the last field/entry.
            current = {
                'position': pos,
                'day': day,
                'week': week,
                'meridiem': pmam,
                'time': time,
            }
            books[bookname].setdefault(STAT, {})[idx] = current
            if STAT == 'HL':
                # remember the highlight so a following note can be
                # appended to its content
                last_hl = current
                last_hl_book = bookname
            print(pos, STAT, day, week, 'PM' if das.group(5) == "下午" else 'AM')
            continue

        # Anything else is a content line of the current section.
        if current is None:
            # START / NONE / BM: nothing to store
            continue
        if STAT == 'HL':
            # a highlight may span several lines; keep all of them
            if 'content' in current:
                current['content'] += '\n' + line
            else:
                current['content'] = line
        elif STAT == 'NT':
            # keep the note text in its own entry ...
            if 'content' in current:
                current['content'] += '\n' + line
            else:
                current['content'] = line
            # ... and append it to the previous highlight of the same book,
            # if there is one (a note may come before any highlight)
            if last_hl is not None and last_hl_book == bookname:
                last_hl['content'] = (last_hl.get('content', '')
                                      + '(CG注:' + line + ')')

#print(json.dumps(books,indent=4,sort_keys=True,ensure_ascii=False))