kindle manager

2020-05-28 13:19:51 +08:00
parent 1aee17aa64
commit 12a7d01ad4
1 changed files with 240 additions and 0 deletions
--- a/kread.py
+++ b/kread.py
@@ -0,0 +1,240 @@
+
+#############################################
+##   PROGRAM: kman.py
+##   AUTHOR:  Chengan
+##   CREATE:  20200526
+##   douboer@gmail.com
+#############################################
+
+import platform
+import re
+import json
+import logging
+from collections import defaultdict
+
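+# Data structure: a dict keyed by book name. Each book maps 'author' to the
+# author string and has one entry per clipping section.
+# NOTE: the example below labels the entries "sectionNNNN" for readability;
+# the code stores them under the bare section index as a string (e.g. "1636").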
+'''
+books =
+{
+    "bookname_xxx": {
+        "author": "李",
+        "section1636": {
+            "content": "张",
+            "day": "2020年4月3日",
+            "meridiem": "下午",
+            "position": "311-311",
+            "time": "3:00:53",
+            "type": "HL",
+            "week": "星期五"
+        },
+        "section1651": {
+            "content": "治",
+            "day": "2020年4月3日",
+            "meridiem": "下午",
+            "position": "514",
+            "time": "3:43:50",
+            "type": "NT",
+            "week": "星期五"
+        },
+        "section1814": {
+            "content": null,
+            "day": "2020年4月12日",
+            "meridiem": "下午",
+            "position": "5186",
+            "time": "2:20:12",
+            "type": "BM",
+            "week": "星期日"
+        },
+        ...
+    },
+    ...
+}
+'''
+
+# modi clippath for different os
+SYS = 'WIN' if platform.system()=='Windows' else \
+   ('LINUX' if platform.system()=='LINUX' else 'MAC')
+
+# some constants
+LASTLINE = '=========='
+NTPREF   = '--CG注:'
+CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
+STAT     = 'NONE'
+DEBUG    = 1   # 0 - INFO; 1 - DEBUG
+LOG2FILE = 1   # 0 - to stdio; 1 - to file
+
+# log info
+logger = logging.getLogger()
+#formatter = logging.Formatter
+#    ('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
+formatter = logging.Formatter('')
+
+if LOG2FILE:
+    handler = logging.FileHandler("log")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+else:
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
+
+if DEBUG:
+    logger.setLevel(logging.DEBUG)
+
+#author & bookname info
+au = re.compile(
+r'''
+^\ufeff*
+(.+) \(         #bookname
+(.+)\)          #author
+''', flags=re.X )
+
+# page & date info
+#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
+da = re.compile(
+r'''
+\#
+(\d+-{0,1}\d+)                 #group1 - page
+.+
+(笔记|标注|书签)               #group2 - type
+.+
+(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
+(星期.)                        #group4 - week
+\s
+(..)                           #group5 - pm/am
+(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
+''', flags=re.X )
+
+# input: section dict & and section index
+# return: dict
+#   d = { 'bookname':bookname,
+#          bookname: {
+#             'author':author
+#             'section0':{
+#                 'type':'HL',
+#                 'position':'123',
+#                 'day':'2020年5月26日',
+#                 'week':'星期二',
+#                 'meridiem':'PM',
+#                 'time':'10:26:31'
+#                 'content':content }}}
+def parse_section(sec,idx):
+    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
+    # 2. bookmark section only two lines
+    # 3. other not correct format < 2
+    if len(sec)<=2:
+        return False
+
+    # parse #2 line
+    section  = defaultdict(dict)
+    authinfo = sec[0]
+    dateinfo = sec[1]
+    content  = sec[2] if len(sec)==3 else None
+
+    das = da.search(dateinfo)
+    # type of section
+    '''
+    STAT :
+        START  - start line of section
+        BM     - section is a bookmark
+        HL     - section is a highlight
+        NT     - section is a note
+    '''
+    tpy   = ('HL' if das.group(2)=='标注' else \
+            ('NT' if das.group(2)=='笔记' else 'BM'))
+    pos   = das.group(1)
+    day   = das.group(3)
+    week  = das.group(4)
+    pmam  = das.group(5)
+    time  = das.group(6)
+
+    # parse #1 line
+    aus = au.search(authinfo)
+    bookname = aus.group(1)
+    author   = aus.group(2)
+    section[bookname]['author'] = author
+
+    section['bookname'] = bookname
+    section[bookname][str(idx)] = {
+            'type':tpy,
+            'position':pos,
+            'day':day,
+            'week':week,
+            'meridiem':pmam,
+            'time':time,
+            'content':content }
+
+    return section
+
+# Format the collected clippings for output (not implemented yet).
+# input: books - dict of parsed clippings
+#        f     - output format, one of 'MD', 'TXT', 'JSON'
+# output: not defined yet; the function is a stub and returns None
+# NOTE(review): the name is spelled "formmat_out"; callers must use that spelling.
+def formmat_out(books,f='MD'):
+    pass
+
+# Search the clippings for a key word (not implemented yet).
+# input: books - dict of parsed clippings
+#        s     - key word to look for
+#        t     - clip type to search: 'ALL', 'HL', 'BM' or 'NT'
+#        p     - part to search: 'ALL', 'TITLE', 'AUTHOR' or 'CONTENT'
+# output: not defined yet; the function is a stub and returns None
+def search_clip(books, s, t='ALL', p='ALL'):
+    pass
+
+if __name__ == '__main__':
+    # 4 lines for each section seperated with '======='
+    # so read 4 lines before '======='
+    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
+        books   = defaultdict(dict)
+        secd    = defaultdict(dict)
+        sidx    = 0
+        idx     = 0
+        sec     = []
+        STAT    = 'START'
+        for line in f.readlines():
+            line = line.strip()
+            if re.match(r'^\s*$',line): continue
+            idx += 1
+
+            if not re.search(LASTLINE,line):
+                # content more than 1 line
+                if idx>3:
+                    sec[2] += str(' '+line)
+                    logger.debug('idx {} {}'.format(idx, sec[2]))
+                else:
+                    sec.append(line)
+                    logger.debug('idx {} {}'.format(idx, sec[idx-1]))
+            else:
+                idx   = 0
+                sidx += 1
+
+                # parsing section & fill data structure
+                secd = parse_section(sec,sidx)
+
+                if secd:
+                    bn  = secd['bookname']
+                    tpy = secd[bn][str(sidx)]['type']
+
+                    books[bn]['author'] = secd[bn]['author']
+                    books[bn][str(sidx)] = secd[bn][str(sidx)]
+
+                    if tpy=='NT' and books[bn][str(sidx-1)]['type']=='HL':
+                        books[bn][str(sidx-1)]['content'] += str(NTPREF+sec[2])
+                else: # BM or not correct format section
+                    sidx -= 1
+
+                # initial section for next section loop
+                sec = []
+
+    # print data with json format
+    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
+