kindle manager

gavin
2020-06-06 08:43:34 +08:00
parent c0bfa52fc3
commit 722757dadd
8 changed files with 498 additions and 358 deletions

kman.py

@@ -8,7 +8,9 @@
#########################################################
import re
import os
import json
import time
import logging
import platform
from collections import defaultdict
@@ -107,347 +109,409 @@ r'''
(\d{1,2}:\d{1,2}:\d{1,2}) #group6 - time
''', flags=re.X )
class kMan:
def __init__(self, parent=None):
self.hlnum = 0
self.ntnum = 0
self.refleshtime = '2020/10/10 10:00:00'
self.status = self.status_info()
def status_info(self):
s1 = u'Highlight: {} Note: {} RefreshTime: {}'. \
format(self.hlnum,self.ntnum,self.refleshtime)
kp = self.get_kindle_path()
if not kp:
s2 = u'Disconnected'
else:
with open(kp+'/system/version.txt' , 'r', encoding='utf8', errors='ignore') as f:
s2 = u'Connected ({}) version {}'.format(kp,f.read().strip())
return [s1,s2]
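# Illustrative return value (numbers and version string are assumed, not from this commit):
# with a kindle mounted at /Volumes/Kindle, status_info() returns something like
#   ['Highlight: 120 Note: 15 RefreshTime: 2020-06-06 08:40:00',
#    'Connected (/Volumes/Kindle) version Kindle 5.12.3']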
def parse_section(self,s,i):
"""parse section
Args:
s: section line list
i: section index
Returns:
dict like this:
d = { 'bookname':bookname,
bookname: {
'author':author,
'0':{
'type':'HL',
'position':'123',
'day':'2020年5月26日',
'week':'星期二',
'meridiem':'PM',
'time':'10:26:31',
'content':content }}}
"""
# 1. a highlight over a picture has an empty content (#3) line, so only two lines
# 2. a bookmark section has only two lines
# 3. any other malformed section has fewer than 2 lines
if len(s)<=2:
return False
# parse #2 line
section = defaultdict(dict)
"""
authinfo = sec[0]
dateinfo = sec[1]
content = sec[2] if len(sec)==3 else None
"""
(authinfo, dateinfo, content) = \
(s[0], s[1], s[2] if len(s)==3 else None)
das = da.search(dateinfo)
# type of section
'''
STAT :
START - start line of section
BM - section is a bookmark
HL - section is a highlight
NT - section is a note
'''
tpy = ('HL' if das.group(2)=='标注' else \
('NT' if das.group(2)=='笔记' else 'BM'))
"""
pos = das.group(1)
day = das.group(3)
week = das.group(4)
pmam = das.group(5)
time = das.group(6)
"""
(pos, x, day, week, pmam, time) = das.groups()[0:6]
# parse #1 line
aus = au.search(authinfo)
bookname = aus.group(1)
author = aus.group(2)
section[bookname]['author'] = author
section['bookname'] = bookname
section[bookname][str(i)] = {
'type':tpy,
'position':pos,
'day':day,
'week':week,
'meridiem':pmam,
'time':time,
'content':content }
return section
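# Example (illustrative; the clipping text is assumed and the exact captures depend on
# the `au`/`da` regexes defined above): for a section such as
#   ['BOOKNAME (AUTHOR)',
#    '- 您在位置 #123 的标注 | 添加于 2020年5月26日星期二 下午10:26:31',
#    'highlighted text']
# parse_section(s, 3) would return roughly
#   {'bookname': 'BOOKNAME',
#    'BOOKNAME': {'author': 'AUTHOR',
#                 '3': {'type': 'HL', 'position': '123', 'day': '2020年5月26日',
#                       'week': '星期二', 'meridiem': '下午', 'time': '10:26:31',
#                       'content': 'highlighted text'}}}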
def format_time(self,ds):
""" format date
Args:
ds: 2020年1月13日 星期一 上午 8:11:05
Return:
2020/1/13 8:11:05
"""
d = ds.split(' ')
res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})',d[0])
ymd = '/'.join(res.groups())
res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})',d[3])
tm = ' '+str(int(res.group(1)) + (0 if d[2]=='上午' else 12))+res.group(2)
return ymd+tm
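# Worked example (traced from the code above):
#   format_time('2020年5月26日 星期二 下午 10:26:31') -> '2020/5/26 22:26:31'
# (the hour gets +12 because the meridiem field is not '上午')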
def format_data(self,bks, ft='MD'):
""" format data for MD & CSV
Args:
bks: books dict
ft: can be 'MD'/'CSV'
Return:
list [header, sections]
header and sections are lists
"""
hd =[] # header
secs =[] # content
DELIMITER = '|' if ft=='MD' else ','
hd.append(DELIMITER.join(['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT']))
if ft=='MD':
hd.append(DELIMITER.join(['--' for i in range(5)]))
for kb,vb in bks.items():
author = vb['author']
for ks, vs in vb.items():
if ks in ['author', 'lines']: continue
secs.append(DELIMITER.join([vs['type'],kb,author, \
self.format_time(' '.join([vs['day'],vs['week'],\
vs['meridiem'],vs['time']])),vs['content']]))
return hd+secs
def format_out(self,bks, fnpref, ft='MD'):
"""format output and write to file
markdown format:
TYPE | bookname | author | marktime | content
--|--|--|--|--
xx|xx|xx|xx|xx
CSV format:
TYPE,bookname,author,marktime,content
xx,xx,xx,xx,xx
marktime: 20200403 PM 3:0:3 星期五
Args:
bks: books dict
ft: can be 'MD'/'JSON'/'CSV'
Returns: None (writes the formatted 'bks' dict to file)
"""
suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
op = fnpref+suff[ft]
with open(op, 'w', encoding='utf8', errors='ignore') as fw:
if ft=='JSON':
fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
elif ft in ['MD','CSV']:
for s in self.format_data(bks, ft):
fw.write(s)
fw.write('\n')
else:
fw.write(json.dumps(bks)) # only for load back
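# Usage sketch (file name prefix is illustrative):
#   km.format_out(books, 'clippings', ft='CSV')   # writes clippings.csv
#   km.format_out(books, 'clippings', ft='JSON')  # writes clippings.json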
def drop_duplicate(self,bks):
""" drop duplicated section
If the same place is marked a second time, kindle creates two records,
so the duplicated record needs to be removed
Args:
bks: books dict
Return:
books dict with duplicate sections removed
"""
[preks,prevs] = ['',{'content':'!#$%^&$%','type':'xx'}]
for kb,vb in bks.items():
bks[kb]['lines'] = 0
# add copy() or throw RuntimeError: dictionary changed size during iteration
# reference - http://www.cocoachina.com/articles/89748
for ks, vs in vb.copy().items():
if ks in ['author', 'lines']: continue
bks[kb]['lines'] += 1
if (vs['content'] in prevs['content'] or \
prevs['content'] in vs['content']) and \
prevs['type'] == vs['type']:
bks[kb].pop(preks)
#if vs['content'] != prevs['content']:
# print('prevs',prevs['type'],prevs['content'])
# print(' vs', vs['type'], vs['content'])
preks = ks
prevs = vs
return bks
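# Illustrative case (assumed data): re-marking a passage produces two adjacent
# records of the same type whose contents are substrings of each other
# ('foo' / 'foo bar'); the earlier record is popped and only the later one is kept.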
def add_note_to_highlight(self,bks):
""" append note content to corresponding highlight
and remove NT sections
Args:
bks: books dict
Return:
changed books
"""
[preks,prevs] = ['',{'content':'!#$%^&$%','type':'xx'}]
for kb,vb in bks.items():
for ks,vs in vb.copy().items():
if ks in ['author', 'lines']: continue
if [prevs['type'], vs['type']] == ['HL','NT']:
bks[kb][preks]['content'] += str(NTPREF+vs['content'])
bks[kb].pop(ks)
preks = ks
prevs = vs
return bks
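# Illustrative case (assumed data): when a NT record immediately follows its HL
# record, the note text is appended to the highlight content (prefixed with
# NTPREF) and the separate NT entry is removed from the book dict.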
def search_clip(self,bks, s, t='ALL', p='ALL'):
"""search clip, searching scope may be title/author/content
Args:
input: bks: books dict
s: key word
t: 'ALL'
'HL'
'BM'
'NT'
p: 'ALL'
'TITLE'
'AUTHOR'
'CONTENT'
Return: [match count, matched books dict]
"""
nbks = defaultdict(dict)
nu = 0
for kb,vb in bks.items():
nbks[kb]['lines'] = 0
for ks,vs in vb.copy().items():
if ks in ['author', 'lines']:
nbks[kb][ks] = vs
continue
if t in ['ALL', vs['type']]:
scopestr = {'ALL':''.join([kb,vb['author'],vs['content']]), \
'TITLE':kb, 'AUTHOR':vb['author'], 'CONTENT':vs['content']}
found = re.search(s, scopestr[p])
if found:
nbks[kb][ks] = vs
nbks[kb]['lines'] += 1
nu += 1
if nbks[kb]['lines']==0:
nbks.pop(kb)
return [nu,nbks]
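# Usage sketch (keyword and output prefix are illustrative):
#   n, hits = km.search_clip(books, '经济', t='HL', p='TITLE')
#   if n: km.format_out(hits, 'search_economy', ft='MD')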
# to be implemented
def statistic(self,bks):
pass
def dict2json(self,d):
"""convert dict to json
Args: d is the dict
Return: json string
"""
jstr = json.dumps(d)
return jstr
def json2dict(self,jf):
"""convert dict to json
Args: jf is the file saved json string
Return: dict
"""
d = {}
with open(jf, 'r', encoding='utf8', errors='ignore') as f:
d=json.load(f)
return d
def get_kindle_path(self):
"""check and return kindle device path
Args:
Return:
if kindle connected, return path string of kindle device
else return None
"""
cmd = "wmic logicaldisk get name,volumename" if os.name=='nt'\
else ("ls /Volumes/Kindle" if os.name=='posix' else '')
# not tested on windows & linux
with os.popen(cmd) as s:
r = s.read()
if os.name == 'nt': # windows
for d in r.split('\n'):
if 'Kindle' in d: return re.split(r'\s+', d)[0]
elif os.name == 'posix': # mac os
if r: return('/Volumes/Kindle')
else:
pass
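# Illustrative results (platform behaviour assumed from the commands above):
#   macOS   -> '/Volumes/Kindle' when the device is mounted
#   Windows -> the drive letter (e.g. 'E:') parsed from the wmic output
#   otherwise the function falls through and returns None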
def import_clips(self, tp='local'):
"""import clips from local file or kindle
each section is 4 lines separated with '======='
so read the 4 lines before each '======='
Args: tp: 'local' local clipping file
'kindle' kindle clipping file
Return: 0 - kindle import requested but no kindle is connected
books dict
"""
if tp=='kindle':
kp = self.get_kindle_path()
if not kp: return 0
else: path = kp
else:
path = CLIPPATH
# loop to fill books dict
with open(path, 'r', encoding='utf8', errors='ignore') as f:
bks = defaultdict(dict)
secd = defaultdict(dict)
sidx = 0
idx = 0
sec = []
for line in f.readlines():
line = line.strip()
if re.match(r'^\s*$',line): continue
idx += 1
if not re.search(LASTLINE,line):
# content more than 1 line
if idx>3:
sec[2] += str(' '+line)
#logger.debug('idx {} {}'.format(idx, sec[2]))
else:
sec.append(line)
#logger.debug('idx {} {}'.format(idx, sec[idx-1]))
else:
idx = 0
sidx += 1
# parsing section & fill data structure
secd = self.parse_section(sec,sidx)
if secd:
bn = secd['bookname']
tpy = secd[bn][str(sidx)]['type']
bks[bn]['author'] = secd[bn]['author']
bks[bn][str(sidx)] = secd[bn][str(sidx)]
# do not append note content to the highlight here,
# because notes may be duplicated; remove duplicates first
"""
if tpy=='NT' and bks[bn][str(sidx-1)]['type']=='HL':
bks[bn][str(sidx-1)]['content'] += str(NTPREF+sec[2])
"""
if tpy=='HL': self.hlnum += 1
elif tpy=='NT': self.ntnum += 1
else: # BM or not correct format section
sidx -= 1
# initial section for next section loop
sec = []
self.refleshtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
return bks
if __name__=='__main__':
#books = defaultdict(dict)
km = kMan()
books = km.import_clips('local')
# remove duplication
km.drop_duplicate(books)
# test search note function
searchnote = km.search_clip(books, '三大都市圈', 'ALL', 'CONTENT')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchcontent', ft='MD')
searchnote = km.search_clip(books, '经济', 'ALL', 'TITLE')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchtitle', ft='MD')
searchnote = km.search_clip(books, '巴曙松', 'ALL', 'AUTHOR')
if searchnote[0] > 0: km.format_out(searchnote[1], 'searchauthor', ft='MD')
# add note content to highlight, then delete note
km.add_note_to_highlight(books)
# test dict json convert
with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
fw.write(km.dict2json(books))
if km.json2dict('./xx')==books: print('test OK')
km.format_out(books, OUTPREF, ft='MD')
# print data with json format
logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))