# kman/file2.py


#############################################
##   PROGRAM: file2.py
##   AUTHOR:  Chengan 20200526
##            douboer@gmail.com
#############################################

import re
import json
from collections import defaultdict

# some constants
# Section separator string; presumably the line that terminates each
# clipping section in the clippings file - TODO confirm against a real file.
BOUNDARY = '==========\n'
# MACOS - /Volumes/Kindle/documents/My\ Clippings.txt
# Path of the clippings file that the parsing loop further down opens.
CLIPFILE = './My Clippings.txt'
# Current parser state; starts as 'NONE' and is reassigned while parsing.
STAT     = 'NONE'

'''
STAT :
    NONE   - match nothing
    START  - start line of section
    BM     - section is a bookmark
    HL     - section is a highlight
    NT     - section is a note
'''
# data structure
'''
book = {'bookname1':
           { 'author':'chen',
             'HL':
                 { 'index1':
                     {
                         'position':'123-145',
                         'content':'xxxx',
                         'day':'2020年5月26日',
                         'week':'星期二',
                         'meridiem':'PM',
                         'time':'10:26:31'
                     },
                     'index2':
                     {
                     ...
                     },
                     ...
                 },
#            'BM':   xxx skip bookmark because the content is empty
#                { 'index1':
#                    {
#                        'position':'123',
#                        'day':'2020年5月26日',
#                        'week':'星期二',
#                        'meridiem':'PM',
#                        'time':'10:26:31'
#                    },
#                    'index2':
#                    {
#                    ...
#                    },
#                    ...
#                }
             'NT':
                 { 'index1':
                     {
                         'position':'123',
                         'content':'xxxx',
                         'day':'2020年5月26日',
                         'week':'星期二',
                         'meridiem':'PM',
                         'time':'10:26:31'
                     },
                     'index2':
                     {
                     ...
                     },
                     ...
                 }
            },
         'bookname2':
           ...
       }
'''

# author & bookname info
# Matches the title line of a section: "<bookname> (<author>)".
# The line must start with one or more U+FEFF (BOM) characters.
# With re.X, unescaped whitespace inside the pattern is ignored, so the
# blank between the book name and "(" has to be matched explicitly (\s*);
# it is kept outside group 1 so the book name carries no trailing space.
# Both groups are greedy: for "A (B) (C)" the book name is "A (B)" and the
# author is "C".
au = re.compile(
r'''
^\ufeff+
(.*\S)          #group1 - bookname (no trailing whitespace)
\s*\(
(.+)\)          #group2 - author
''', flags=re.X )

# page & date info
# sample input line:
# 您在位置 #4286 的笔记 | 添加于 2020年1月30日星期四 上午10:26:31^M
# re.X (VERBOSE): the pattern may span several lines, whitespace in it is
# ignored, and comments may be embedded.
# equivalent Vim-style pattern:
#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
# The position is either a single number ("4286", "5") or a range
# ("123-145"); the range part is optional so one-digit positions match too.
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                 #group1 - page
.+
(笔记|标注|书签)               #group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
(星期.)                        #group4 - week
\s
(..)                           #group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
''', flags=re.X )

# Parse the clippings file line by line into `books`:
#   books[bookname]['author']          -> author string
#   books[bookname]['HL' | 'NT'][idx]  -> entry dict, keyed by the line number
#                                         of the section's position/date line
# Each entry holds 'position', 'day', 'week', 'meridiem' ('AM'/'PM'), 'time'
# and, once the text lines have been read, 'content'.
# Bookmarks ('BM') are skipped because they carry no content.
with open(CLIPFILE, 'r', encoding='utf8', errors='ignore') as f:
    books = defaultdict(dict)
    bookname    = ''
    author      = ''
    idx         = 0
    num_section = 0   # not updated by this loop
    num_line    = 0   # not updated by this loop

    entry        = None   # entry dict of the section currently being read
    last_hl      = None   # most recent highlight entry (target for note text)
    last_hl_book = None   # book that `last_hl` belongs to
    separator    = BOUNDARY.strip()

    # Iterate the file object directly instead of loading every line first.
    for line in f:
        idx += 1
        line = line.strip()

        # After strip() a blank line is '', and ''.isspace() is False,
        # so emptiness has to be tested directly.
        if not line:
            continue

        # End of a section: forget the current entry so that stray lines
        # are never recorded as content.
        if line == separator:
            STAT = 'NONE'
            entry = None
            continue

        # Title line: book name and author.
        aus = au.search(line)
        if aus:
            STAT = 'START'
            entry = None
            bookname = aus.group(1)
            author   = aus.group(2)
            books[bookname]['author'] = author
            print("book:",aus.group(1),"auth:",aus.group(2))
            continue

        # Position/date line: decides the section type and opens its entry.
        das = da.search(line)
        if das:
            STAT = ('HL' if das.group(2)=='标注' else ('NT' if das.group(2)=='笔记' else 'BM'))

            # skip bookmark
            if STAT=='BM':
                entry = None
                continue

            pos   = das.group(1)
            day   = das.group(3)
            week  = das.group(4)
            pmam  = das.group(5)
            time  = das.group(6)
            meridiem = 'PM' if pmam=="下午" else 'AM'

            # Build the entry once and add it to the per-type dict;
            # assigning a fresh dict per field would throw away both the
            # other fields and every earlier entry of this book.
            entry = {
                'position': pos,
                'day': day,
                'week': week,
                'meridiem': meridiem,
                'time': time,
            }
            books[bookname].setdefault(STAT, {})[idx] = entry
            if STAT=='HL':
                last_hl      = entry
                last_hl_book = bookname

            print(pos,STAT,day,week,meridiem)
            continue

        # Anything else is section text. It is ignored unless a highlight
        # or note entry is open (e.g. text after a title or a bookmark).
        if entry is None:
            continue

        # Keep multi-line text together in the same entry.
        if 'content' in entry:
            entry['content'] += '\n' + line
        else:
            entry['content'] = line

        # If the section is a note, also append its text to the previous
        # highlight of the same book. The guard avoids touching a
        # highlight that does not exist or belongs to another book.
        if STAT=='NT' and last_hl is not None and last_hl_book==bookname:
            last_hl['content'] = last_hl.get('content', '') + '(CG注:' + line

#print(json.dumps(books,indent=4,sort_keys=True,ensure_ascii=False))