kindle manager

2020-05-30 21:32:48 +08:00
parent dffae936d8
commit 87d661fcb6
3 changed files with 198 additions and 429 deletions
--- a/kman.py
+++ b/kman.py
@@ -2,7 +2,7 @@
 #########################################################
 ## @file   : kman.py
 ## @desc   : kindle note management tool
-## @create : 20200526
+## @create : 2020/05/26
 ## @author : Chengan
 ## @email  : douboer@gmail.com
 #########################################################
@@ -63,6 +63,7 @@ CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
 OUTPREF  = './clip'
 DEBUG    = 1   # 0 - INFO; 1 - DEBUG
 LOG2FILE = 1   # 0 - to stdio; 1 - to file
+DELIMITER= '|'

 # log info
 logger = logging.getLogger()
@@ -125,8 +126,6 @@ def parse_section(s,i):
                      'time':'10:26:31'
                      'content':content }}}
    """
-
-
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
@@ -135,9 +134,13 @@ def parse_section(s,i):

    # parse #2 line
    section  = defaultdict(dict)
+    """
    authinfo = sec[0]
    dateinfo = sec[1]
    content  = sec[2] if len(sec)==3 else None
+    """
+    (authinfo, dateinfo, content) = \
+            (s[0], s[1], s[2] if len(s)==3 else None)

    das = da.search(dateinfo)
    # type of section
@@ -150,11 +153,14 @@ def parse_section(s,i):
    '''
    tpy  = ('HL' if das.group(2)=='标注' else \
           ('NT' if das.group(2)=='笔记' else 'BM'))
+    """
    pos  = das.group(1)
    day  = das.group(3)
    week = das.group(4)
    pmam = das.group(5)
    time = das.group(6)
+    """
+    (pos, x, day, week, pmam, time) = das.groups()[0:6]

    # parse #1 line
    aus = au.search(authinfo)
@@ -174,9 +180,52 @@ def parse_section(s,i):

    return section

-def format_out(bks, ft='MD'):
+def format_time(ds):
+    """ format a kindle clipping timestamp as 'Y/M/D H:MM:SS' (24-hour clock)
+
+    Args:
+        ds: space separated 'date weekday meridiem time', for example
+            2020年1月13日 星期一 下午 8:11:05
+    Return:
+        2020/1/13 20:11:05
+        any meridiem other than '上午' is treated as afternoon
+    """
+    d = ds.split(' ')
+    res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})',d[0])
+    ymd = '/'.join(res.groups())
+    res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})',d[3])
+    # 12-hour clock: 12 o'clock is hour 0 of its half day, so
+    # 上午 12:xx -> 0:xx and 下午 12:xx -> 12:xx (not 24:xx)
+    hour = int(res.group(1)) % 12 + (0 if d[2]=='上午' else 12)
+
+    return ymd+' '+str(hour)+res.group(2)
+
+def format_data(bks, ft='MD'):
+    """ format data for MD & CSV
+
+    Args:
+        bks: books dict, each book holds 'author', optionally 'lines',
+             and one entry per section
+        ft: can be 'MD'/'CSV'
+
+    Return:
+        flat list of text lines: header line(s) followed by
+        one line per section
+    """
+    # local separator, named so that the module level DELIMITER is not shadowed
+    sep = '|' if ft=='MD' else ','
+
+    def cell(v):
+        # parse_section stores None as content when a clipping has no text line
+        v = '' if v is None else str(v)
+        if ft=='MD':
+            # a raw '|' would start a new table column
+            return v.replace('|', '\\|')
+        if any(c in v for c in ',"\n'):
+            # quote CSV fields holding a separator, quote or newline
+            return '"'+v.replace('"', '""')+'"'
+        return v
+
+    hd = [sep.join(['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT'])]
+    if ft=='MD':
+        hd.append(sep.join(['--']*5))
+
+    secs = []
+    for kb,vb in bks.items():
+        author = vb['author']
+        for ks, vs in vb.items():
+            if ks in ['author', 'lines']: continue
+            marktime = format_time(' '.join([vs['day'],vs['week'],vs['meridiem'],vs['time']]))
+            secs.append(sep.join(cell(v) for v in [vs['type'],kb,author,marktime,vs['content']]))
+
+    return hd+secs
+
+def format_out(bks, fnpref, ft='MD'):
    """format output and write to file
-    MARKDOWN format:
+    markdown format:
    TYPE | bookname | author | marktime | content
    --|--|--|--|--
    xx|xx|xx|xx|xx
@@ -195,15 +244,107 @@ def format_out(bks, ft='MD'):
    """

    suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
-    op = OUTPREF+suff[ft]
+    op = fnpref+suff[ft]

-    with open(op, 'w', encoding='gbk', errors='ignore') as fw:
+    with open(op, 'w', encoding='utf8', errors='ignore') as fw:
        if ft=='JSON':
-            ft.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
-        elif ft=='MD':
-            pass
+            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
+        elif ft in ['MD','CSV']:
+            for s in format_data(bks, ft):
+                fw.write(s)
+                fw.write('\n')
        else:
-            ft.write(json.dumps(bks)) # only for load back
+            fw.write(json.dumps(bks)) # only for load back
+
+def drop_duplicate(bks):
+    """ drop duplicated section
+
+    If I mark a second time in the same place, kindle creates two notes,
+    so the earlier record has to be removed. Two consecutive sections of
+    one book are duplicates when they have the same type and the content
+    of one contains the content of the other; the later one is kept.
+
+    Args:
+        bks: books dict, changed in place
+    Return:
+        the same books dict without duplicated sections, 'lines' of each
+        book is set to the number of sections left
+    """
+    for kb,vb in bks.items():
+        # restart the comparison for every book: a section must never be
+        # compared with (or popped from) the previous book
+        preks,prevs = None,None
+        # iterate over a snapshot because sections are popped inside the loop
+        for ks, vs in list(vb.items()):
+            if ks in ['author', 'lines']: continue
+            cur = vs['content']
+            pre = prevs['content'] if prevs is not None else None
+            # empty/None content is never a duplicate ('' is in every string)
+            if cur and pre and prevs['type'] == vs['type'] and \
+                    (cur in pre or pre in cur):
+                vb.pop(preks)
+                # show pairs which are merged although they are not identical
+                if cur != pre:
+                    print('prevs',prevs['type'],pre)
+                    print('   vs',   vs['type'],cur)
+
+            preks,prevs = ks,vs
+
+        # count after the removal so that dropped sections are not included
+        vb['lines'] = sum(1 for k in vb if k not in ['author', 'lines'])
+
+    return bks
+
+def add_note_to_highlight(bks):
+    """ append note content to corresponding highlight
+    and remove NT sections
+
+    A note (NT) which directly follows a highlight (HL) in the same book is
+    appended, prefixed with NTPREF, to the content of that highlight and
+    the note section is removed from the book.
+
+    Args:
+        bks: books dict, changed in place
+    Return:
+        changed books
+    """
+    for kb,vb in bks.items():
+        # reset per book: a note at the top of a book does not belong to
+        # the last highlight of the previous book
+        prevs = None
+        # iterate over a snapshot because notes are popped inside the loop
+        for ks,vs in list(vb.items()):
+            if ks in ['author', 'lines']: continue
+            if prevs is not None and [prevs['type'], vs['type']] == ['HL','NT']:
+                # content may be None/empty, treat it as empty text
+                prevs['content'] = (prevs['content'] or '') + \
+                        str(NTPREF+(vs['content'] or ''))
+                vb.pop(ks)
+                # keep the section counter in line with the removal
+                if 'lines' in vb: vb['lines'] -= 1
+
+            prevs = vs
+
+    return bks
+
+def search_clip(bks, s, t='ALL', p='ALL'):
+    """search clip, searching scope may be title/author/content
+    Args:
+        bks: books dict, not changed
+        s: key word, used as regular expression pattern of re.search
+        t: section type to search
+           'ALL'
+           'HL'
+           'BM'
+           'NT'
+        p: searching scope
+           'ALL'
+           'TITLE'
+           'AUTHOR'
+           'CONTENT'
+    Return:
+        [number of matched sections, dict of matched books]
+        the dict has the layout of bks and holds only books with at least
+        one match, 'lines' of each book is its number of matched sections
+    """
+    nbks = defaultdict(dict)
+    nu = 0
+    for kb,vb in bks.items():
+        matched = {}
+        for ks,vs in vb.items():
+            # 'lines' of the source book must not be copied, it would
+            # overwrite the counter of matched sections
+            if ks in ['author', 'lines']: continue
+            if t not in ['ALL', vs['type']]: continue
+            # content is None for a section without text line
+            content = vs['content'] or ''
+            scopestr = {'ALL':''.join([kb,vb['author'],content]), \
+                    'TITLE':kb, 'AUTHOR':vb['author'], 'CONTENT':content}
+            if re.search(s, scopestr[p]):
+                matched[ks] = vs
+
+        # books without any matched section are left out of the result
+        if not matched: continue
+        nbks[kb]['lines'] = len(matched)
+        nbks[kb]['author'] = vb['author']
+        nbks[kb].update(matched)
+        nu += len(matched)
+
+    return [nu,nbks]

 def statistic(bks):
    pass
@@ -226,26 +367,11 @@ def json2dict(jf):
        d=json.load(f)
    return d

-def search_clip(bks, s, t='ALL', p='ALL'):
-    """search clip, searching scope may be title/author/content
-    Args:
-        input: bks: books dict
-               s: key word
-               t: 'ALL'
-                  'HL'
-                  'BM'
-                  'NT'
-               p: 'ALL'
-                  'TITLE'
-                  'AUTHOR'
-                  'CONTENT'
-    Return: search clipping content
-    """
-    pass
-
 if __name__=='__main__':
    # 4 lines for each section separated with '======='
    # so read 4 lines before '======='
+
+    # loop to fill books dict
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books = defaultdict(dict)
        secd  = defaultdict(dict)
@@ -261,10 +387,10 @@ if __name__=='__main__':
                # content more than 1 line
                if idx>3:
                    sec[2] += str(' '+line)
-                    logger.debug('idx {} {}'.format(idx, sec[2]))
+                    #logger.debug('idx {} {}'.format(idx, sec[2]))
                else:
                    sec.append(line)
-                    logger.debug('idx {} {}'.format(idx, sec[idx-1]))
+                    #logger.debug('idx {} {}'.format(idx, sec[idx-1]))
            else:
                idx   = 0
                sidx += 1
@@ -279,19 +405,41 @@ if __name__=='__main__':
                    books[bn]['author'] = secd[bn]['author']
                    books[bn][str(sidx)] = secd[bn][str(sidx)]

+                    # not add note to highlight content here,
+                    # because NT maybe duplicated, we need remove duplication record before
+                    """
                    if tpy=='NT' and books[bn][str(sidx-1)]['type']=='HL':
                        books[bn][str(sidx-1)]['content'] += str(NTPREF+sec[2])
+                    """
+
                else: # BM or not correct format section
                    sidx -= 1

                # initial section for next section loop
                sec = []

+    # remove duplication
+    drop_duplicate(books)
+
+    # test search note function
+    searchnote = search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
+    if searchnote[0] > 0: format_out(searchnote[1], 'searchcontent', ft='MD')
+    searchnote = search_clip(books, '经济', 'ALL', 'TITLE')
+    if searchnote[0] > 0: format_out(searchnote[1], 'searchtitle', ft='MD')
+    searchnote = search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
+    if searchnote[0] > 0: format_out(searchnote[1], 'searchauthor', ft='MD')
+
+    # add note content to highlight, then delete note
+    add_note_to_highlight(books)
+
    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict('./xx')==books: print( 'test OK')

-# print data with json format
-logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
+    format_out(books, OUTPREF, ft='MD')
+
+    # print data with json format
+    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))
+