kindle manager

2020-05-28 13:19:51 +08:00
parent 1aee17aa64
commit 12a7d01ad4
1 changed files with 240 additions and 0 deletions
--- a/kread.py
+++ b/kread.py
@@ -0,0 +1,240 @@
 #############################################
 ##   PROGRAM: kman.py
 ##   AUTHOR:  Chengan
 ##   CREATE:  20200526
 ##   douboer@gmail.com
 #############################################
 import platform
 import re
 import json
 import logging
 from collections import defaultdict
 # data structure - use dict
 '''
 books =
 {
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
 }
 '''
 # Identify the host OS so the clippings path can be adapted per platform.
 # platform.system() reports 'Windows', 'Linux' or 'Darwin' (capitalised, never
 # upper-case); every value other than Windows/Linux is still treated as macOS.
 SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')
 # some constants
 LASTLINE = '=========='          # separator line that closes each clipping section
 NTPREF   = '--CG注:'             # prefix put in front of a note merged into its highlight
 CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
 STAT     = 'NONE'
 DEBUG    = 1   # 0 - INFO; 1 - DEBUG
 LOG2FILE = 1   # 0 - to stdio; 1 - to file
 # log info
 logger = logging.getLogger()
 # A more verbose format, kept for reference:
 #   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
 formatter = logging.Formatter('')
 if LOG2FILE:
    # The logged clippings contain non-ASCII text, so the log file encoding is
    # fixed to UTF-8 instead of depending on the platform default.
    handler = logging.FileHandler("log", encoding='utf-8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
 else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
 # Set the level in both modes: without this, DEBUG=0 combined with LOG2FILE=1
 # left the root logger at its default WARNING level and INFO was dropped.
 logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
 # author & bookname info
 # The pattern is compiled with re.X, where unescaped whitespace inside the
 # pattern is ignored. The blank between the book name and the author
 # parenthesis is therefore matched explicitly with \s*, and group1 is forced
 # to end on a non-blank character so the book name carries no trailing space.
 au = re.compile(
 r'''
 ^\ufeff*        # optional byte-order mark(s) at the start of the line
 (.*\S)          # group1 - bookname, without trailing blanks
 \s*
 \((.+)\)        # group2 - author, the text inside the last parentheses
 ''', flags=re.X )
 # page & date info
 # original vim-style pattern, kept for reference:
 #\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
 # group1 accepts a single position ("5", "514") as well as a range
 # ("311-311"); the previous form needed at least two digits and therefore
 # rejected single-digit positions.
 da = re.compile(
 r'''
 \#
 (\d+(?:-\d+)?)                 #group1 - page
 .+
 (笔记|标注|书签)               #group2 - type
 .+
 (\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
 (星期.)                        #group4 - week
 \s
 (..)                           #group5 - pm/am
 (\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
 ''', flags=re.X )
 def parse_section(sec,idx):
    """Parse the collected lines of one clipping section.

    input:
        sec - list of the section's lines: sec[0] is the "bookname (author)"
              line, sec[1] the position/type/date line, sec[2] the content
        idx - section index; str(idx) becomes the key of the parsed entry

    return:
        False when the section has two lines or fewer, or when its first two
        lines do not match the expected patterns; otherwise a dict shaped as
            { 'bookname': bookname,
               bookname: {
                  'author': author,
                  str(idx): {
                      'type': 'HL',
                      'position': '123',
                      'day': '2020年5月26日',
                      'week': '星期二',
                      'meridiem': 'PM',
                      'time': '10:26:31',
                      'content': content }}}
    """
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if len(sec)<=2:
        return False

    authinfo = sec[0]
    dateinfo = sec[1]
    content  = sec[2] if len(sec)==3 else None

    das = da.search(dateinfo)
    aus = au.search(authinfo)
    # A line that does not match is reported like any other badly formatted
    # section instead of raising AttributeError on a None match object.
    if das is None or aus is None:
        return False

    # type of section:
    #     HL - section is a highlight
    #     NT - section is a note
    #     BM - section is a bookmark (any other matched type)
    tpy = {'标注': 'HL', '笔记': 'NT'}.get(das.group(2), 'BM')

    bookname = aus.group(1)
    author   = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(idx)] = {
            'type':tpy,
            'position':das.group(1),
            'day':das.group(3),
            'week':das.group(4),
            'meridiem':das.group(5),
            'time':das.group(6),
            'content':content }
    return section
 def formmat_out(books,f='MD'):
    """Format the parsed clippings for output (placeholder, not implemented).

    input:
        books - dict of parsed clippings
        f     - output format: 'MD', 'TXT' or 'JSON'

    output:
        nothing yet; the function returns None
    """
    return None
 def search_clip(books, s, t='ALL', p='ALL'):
    """Search the clippings by title, author or content (placeholder).

    input:
        books - dict of parsed clippings
        s     - key word
        t     - section type to search: 'ALL', 'HL', 'BM' or 'NT'
        p     - searching scope: 'ALL', 'TITLE', 'AUTHOR' or 'CONTENT'

    output:
        nothing yet; the function returns None
    """
    return None
 if __name__ == '__main__':
    # Sections are separated by a LASTLINE ('==========') line, so the
    # non-blank lines are collected until that separator is reached and the
    # collected section is then parsed as a whole.
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books   = defaultdict(dict)
        sidx    = 0     # index of the last section stored in books
        idx     = 0     # number of non-blank lines seen in the current section
        sec     = []    # lines of the current section
        STAT    = 'START'
        # iterate the file object directly instead of loading all lines first
        for line in f:
            line = line.strip()
            if not line: continue
            idx += 1
            if not re.search(LASTLINE,line):
                # content more than 1 line: fold the extra lines into sec[2]
                if idx>3:
                    sec[2] += str(' '+line)
                    logger.debug('idx {} {}'.format(idx, sec[2]))
                else:
                    sec.append(line)
                    logger.debug('idx {} {}'.format(idx, sec[idx-1]))
            else:
                idx   = 0
                sidx += 1
                # parsing section & fill data structure
                secd = parse_section(sec,sidx)
                if secd:
                    bn  = secd['bookname']
                    key = str(sidx)
                    tpy = secd[bn][key]['type']
                    books[bn]['author'] = secd[bn]['author']
                    books[bn][key] = secd[bn][key]
                    # A note is appended to the section stored just before it
                    # when that section is a highlight of the same book. The
                    # previous index may be missing for this book (first
                    # section, or previous section from another book), so it
                    # is looked up with .get() instead of raising KeyError.
                    prev = books[bn].get(str(sidx-1))
                    if tpy=='NT' and prev is not None and prev['type']=='HL':
                        prev['content'] += str(NTPREF+sec[2])
                else: # BM or not correct format section
                    sidx -= 1
                # initial section for next section loop
                sec = []
    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))