kindle manager
This commit is contained in:
174
file2.py
174
file2.py
@@ -1,174 +0,0 @@
|
||||
|
||||
#############################################
|
||||
## PROGRAM: file2.py
|
||||
## AUTHOR: Chengan 20200526
|
||||
## douboer@gmail.com
|
||||
#############################################
|
||||
|
||||
import re
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
# --- Constants --------------------------------------------------------------

# Separator line (presumably written between sections of the clippings file).
BOUNDARY = '==========\n'

# Path of the clippings file to parse.
# On macOS the device copy is /Volumes/Kindle/documents/My\ Clippings.txt
CLIPFILE = './My Clippings.txt'

# Parser state. One of:
#   NONE  - match nothing
#   START - start line of section
#   BM    - section is a bookmark
#   HL    - section is a highlight
#   NT    - section is a note
STAT = 'NONE'

# --- Data structure ---------------------------------------------------------
#
# These are real comments rather than bare triple-quoted strings: a string
# literal in the middle of a module is an expression statement that is
# evaluated and thrown away, not documentation.
#
# books = {
#     'bookname1': {
#         'author': 'chen',
#         'HL': {
#             'index1': {
#                 'position': '123-145',
#                 'content': 'xxxx',
#                 'day': '2020年5月26日',
#                 'week': '星期二',
#                 'meridiem': 'PM',
#                 'time': '10:26:31',
#             },
#             'index2': {
#                 ...
#             },
#             ...
#         },
#         # 'BM' is skipped because a bookmark has no content; it would be:
#         # 'BM': {
#         #     'index1': {
#         #         'position': '123',
#         #         'day': '2020年5月26日',
#         #         'week': '星期二',
#         #         'meridiem': 'PM',
#         #         'time': '10:26:31',
#         #     },
#         #     'index2': {
#         #         ...
#         #     },
#         #     ...
#         # },
#         'NT': {
#             'index1': {
#                 'position': '123',
#                 'content': 'xxxx',
#                 'day': '2020年5月26日',
#                 'week': '星期二',
#                 'meridiem': 'PM',
#                 'time': '10:26:31',
#             },
#             'index2': {
#                 ...
#             },
#             ...
#         },
#     },
#     'bookname2': {
#         ...
#     },
# }
|
||||
|
||||
# Title line of a section: "<bookname> (<author>)".
# The pattern only matches lines that start with one or more U+FEFF (BOM)
# characters.
# NOTE(review): assumes every title line carries a BOM - confirm against the
# clippings files this has to support.
au = re.compile(
    r'''
    ^\ufeff+
    (.*\S)      # group1 - bookname; must end on a non-blank character so the
                #          blank before "(" is not captured (under re.X a
                #          literal space in the pattern is ignored)
    \s*\(
    (.+)\)      # group2 - author (everything up to the last ")")
    ''', flags=re.X)
|
||||
|
||||
# Metadata line of a section: position, section type and timestamp.
# Sample line:
#   - 您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31
# re.X (VERBOSE) lets the pattern span several lines, ignores whitespace in
# the pattern and allows inline comments.
da = re.compile(
    r'''
    \#
    (\d+(?:-\d+)?)                  # group1 - position, "4286" or "123-145";
                                    #          a single digit ("5") is valid too
    .+
    (笔记|标注|书签)                # group2 - type: note | highlight | bookmark
    .+
    (\d{4}年\d{1,2}月\d{1,2}日)     # group3 - date, xxxx年xx月xx日
    (星期.)                         # group4 - day of week
    \s
    (..)                            # group5 - am/pm marker (上午 / 下午)
    (\d{1,2}:\d{1,2}:\d{1,2})       # group6 - time
    ''', flags=re.X)
|
||||
|
||||
def _parse_clippings(lines):
    """Parse the lines of a clippings file into a nested dictionary.

    Each line is stripped and classified as one of:

    * a title line (matches ``au``) - sets the current book and its author;
    * a metadata line (matches ``da``) - opens a highlight ('HL') or note
      ('NT') record keyed by the 1-based number of that line; bookmarks
      are skipped;
    * a boundary line (``BOUNDARY`` without its newline) - closes the
      current section;
    * anything else - content of the record opened by the last metadata
      line.

    The text of a note is also appended to the most recent highlight of the
    same book, prefixed with '(CG注:'.

    Args:
        lines: iterable of text lines, e.g. an open text file.

    Returns:
        ``defaultdict(dict)`` mapping bookname to
        ``{'author': ..., 'HL': {idx: record}, 'NT': {idx: record}}``, where
        each record holds 'position', 'day', 'week', 'meridiem', 'time' and,
        once a content line was seen, 'content'.
    """
    books = defaultdict(dict)
    boundary = BOUNDARY.strip()
    stat = 'NONE'       # local parser state; the module-level STAT is untouched
    bookname = ''
    entry = None        # record of the section currently being read
    last_hl = None      # (bookname, record) of the most recent highlight

    for idx, line in enumerate(lines, start=1):
        line = line.strip()

        # After strip() a blank line is '', and ''.isspace() is False, so
        # test for emptiness directly.
        if not line:
            continue

        # A boundary line closes the section; it must never be stored as
        # highlight or note content.
        if line == boundary:
            stat = 'NONE'
            entry = None
            continue

        aus = au.search(line)
        if aus:
            stat = 'START'
            entry = None
            bookname = aus.group(1)
            author = aus.group(2)
            books[bookname]['author'] = author
            print("book:", bookname, "auth:", author)
            continue

        das = da.search(line)
        if das:
            kind = das.group(2)
            stat = 'HL' if kind == '标注' else ('NT' if kind == '笔记' else 'BM')

            # skip bookmark: it carries no content
            if stat == 'BM':
                entry = None
                continue

            pos, day, week, pmam, time = das.group(1, 3, 4, 5, 6)
            # Build the whole record once and insert it under this line's
            # index, so earlier fields and earlier sections are not replaced.
            entry = {
                'position': pos,
                'day': day,
                'week': week,
                'meridiem': pmam,
                'time': time,
            }
            books[bookname].setdefault(stat, {})[idx] = entry

            print(pos, stat, day, week, 'PM' if pmam == "下午" else 'AM')
            continue

        # Content line. Ignore it when no highlight/note record is open
        # (text right after a title line, a bookmark, or a boundary).
        if entry is None:
            continue

        # Attach the text to the record opened by the metadata line; a
        # further content line of the same section goes on a new line.
        if 'content' in entry:
            entry['content'] += '\n' + line
        else:
            entry['content'] = line

        if stat == 'HL':
            # Remember the highlight so a following note can be appended.
            last_hl = (bookname, entry)
        elif stat == 'NT' and last_hl is not None and last_hl[0] == bookname:
            # Append the note to the previous highlight of the same book.
            # NOTE(review): the '(CG注:' marker has no closing bracket; kept
            # as in the original - confirm whether ')' should follow the note.
            last_hl[1]['content'] += '(CG注:' + line

    return books


with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
    # Iterate the file object directly; there is no need to load every line
    # into memory first with readlines().
    books = _parse_clippings(f)

# To inspect the result:
#print(json.dumps(books,indent=4,sort_keys=True,ensure_ascii=False))
|
||||
Reference in New Issue
Block a user