# kman/kman.py


#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 2020/05/26
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import re
import os
import io
import json
import time
import logging
import platform
import subprocess
from collections import defaultdict

# data structure - use dict
'''
books =
{
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
}
'''

# OS tag, meant for adapting the clippings path to the host system.
# platform.system() reports 'Windows', 'Linux' or 'Darwin'; everything that
# is neither Windows nor Linux is tagged as macOS.
SYS = 'WIN' if platform.system()=='Windows' else \
   ('LINUX' if platform.system()=='Linux' else 'MAC')

# some constants
LASTLINE = '=========='      # separator line that closes every clipping section
NTPREF   = '--CG注:'         # prefix put in front of a note merged into a highlight
#CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
CLIPPATH = './tclip.txt'     # default clippings file to import
OUTPREF  = './clip'          # default output file name without extension
DEBUG    = 1   # 0 - INFO; 1 - DEBUG
LOG2FILE = 1   # 0 - to stdio; 1 - to file
LOGFILE  = 'log'
DELIMITER= '|'
#HEADER   = {0:'type',1:'bookname',2:'author',3:'position',4:'date',5:'content'}

# log info
logger = logging.getLogger()
#formatter = logging.Formatter
#    ('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# an empty format string makes logging fall back to its default '%(message)s'
formatter = logging.Formatter('')

if LOG2FILE:
    # explicit encoding: the logged clippings contain non-ASCII text and the
    # platform default encoding may not be able to represent it
    handler = logging.FileHandler(LOGFILE, encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# honour the DEBUG switch for both log targets (0 - INFO; 1 - DEBUG)
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)

# author & bookname info (first line of a section)
# NOTE: the pattern is compiled in verbose mode, so the blank between the
# title group and the opening parenthesis is ignored; callers strip the groups.
au = re.compile(
r'''
^\ufeff*
(.+) \(         #bookname
(.+)\)          #author
''', flags=re.X )

# page & date info (second line of a section)
#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                 #group1 - position, e.g. 5, 514 or 311-311
.+
(笔记|标注|书签)               #group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
(星期.)                        #group4 - week
\s
(..)                           #group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
''', flags=re.X )

class kMan:
    """Manage Kindle clippings: import, clean up, query and export them.

    Most methods work on a "books dict" shaped like this:

        { bookname: { 'author': str,
                      'lines':  int,      # optional section counter
                      '<idx>':  { 'type': 'HL' | 'NT' | 'BM',
                                  'position': str, 'day': str, 'week': str,
                                  'meridiem': str, 'time': str,
                                  'content': str } } }
    """

    # keys of a book dict that hold bookkeeping data, not clipping sections
    _META_KEYS = ('author', 'lines')
    # label emitted by filter_clips() for each section type
    _TYPE_LABELS = {'HL': '标注', 'NT': '笔记', 'BM': '书签'}

    def __init__(self, parent=None):
        """Reset the counters and compute the initial status strings.

        Args:
            parent: not used by this class
        """
        self.hlnum = 0
        self.ntnum = 0
        self.refleshtime = '2020/10/10 10:00:00'
        self.status = self.status_info()

    def status_info(self):
        """Build the two status strings.

        Return:
            list [s1, s2]: s1 holds the highlight/note counters and the last
            refresh time, s2 tells whether a kindle device was found
        """
        s1 = u'Hightlight: {}      Note: {}      RefleshTime: {}'. \
                format(self.hlnum,self.ntnum,self.refleshtime)
        kp = self.get_kindle_path()

        if not kp:
            s2 = u'Disconnected ({})'.format(CLIPPATH)
        else:
            # a device without a readable version file must not break the
            # status line (and with it the constructor)
            try:
                with open(kp+'/system/version.txt', 'r',
                        encoding='utf8', errors='ignore') as f:
                    version = f.read().strip()
            except OSError:
                version = 'unknown'
            s2 = u'Connected ({}) version {}'.format(kp,version)

        return [s1,s2]

    def parse_section(self,s,i):
        """parse section

        Args:
            s: section line list (title line, position/date line, content)
            i: section index

        Returns:
            False when the section has no content line (picture highlight,
            bookmark) or when its position/date line cannot be parsed,
            else a dict like this:
            d = { 'bookname':bookname,
                   bookname: {
                      'author':author
                      '0':{
                          'type':'HL',
                          'position':'123',
                          'day':'2020年5月26日',
                          'week':'星期二',
                          'meridiem':'PM',
                          'time':'10:26:31'
                          'content':content }}}
        """
        # 1. highlight over the picture, the content(#3 line) is empty, only two lines
        # 2. bookmark section only two lines
        # 3. other not correct format < 2
        if len(s)<=2:
            return False

        authinfo, dateinfo = s[0], s[1]
        # identical to s[2] for the usual three-line section; extra lines are
        # appended blank-separated instead of discarding the content
        content = ' '.join(s[2:])

        # parse #2 line
        das = da.search(dateinfo)
        if das is None:
            return False
        (pos, kind, day, week, pmam, tm) = das.groups()[0:6]
        # HL - highlight, NT - note, BM - bookmark
        tpy = {'标注':'HL', '笔记':'NT'}.get(kind, 'BM')

        # parse #1 line
        aus = au.search(authinfo)
        if aus:
            bookname = aus.group(1).strip()
            author   = aus.group(2).strip()
        else:
            # title line without "(author)": keep the whole line as book name
            bookname = authinfo.lstrip('\ufeff').strip()
            author   = ''
        if not bookname:
            return False

        section = defaultdict(dict)
        section[bookname]['author'] = author
        section['bookname'] = bookname
        section[bookname][str(i)] = {
                'type':tpy,
                'position':pos,
                'day':day,
                'week':week,
                'meridiem':pmam,
                'time':tm,
                'content':content }

        return section

    def format_time(self,ds):
        """ format date
        Args:
            ds: blank separated "day week meridiem time" string,
                e.g. 2020年1月13日 星期一 下午 8:11:05
        Return:
            24-hour time stamp, e.g. 2020/1/13 20:11:05
        """
        d = ds.split(' ')
        res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})',d[0])
        ymd = '/'.join(res.groups())
        res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})',d[3])
        # 12-hour clock: 12 o'clock counts as hour 0 of its half day, so
        # 上午12:05 -> 0:05 and 下午12:05 -> 12:05 (not 24:05)
        morning = d[2] in ('上午', 'AM', 'am')
        hour = int(res.group(1)) % 12 + (0 if morning else 12)

        return ymd+' '+str(hour)+res.group(2)

    @staticmethod
    def _join_fields(fields, ft):
        """Join the fields of one row; escape what would break the layout."""
        if ft=='MD':
            # a literal '|' inside a cell would start a new table column
            return '|'.join(str(f).replace('|', '\\|') for f in fields)

        import csv  # local import: the csv module is only needed here
        buf = io.StringIO()
        # default dialect quotes fields containing ',', '"' or a line break
        csv.writer(buf, lineterminator='').writerow(fields)
        return buf.getvalue()

    def format_data(self,bks, ft='MD'):
        """ format data for MD & CSV

        Args:
            bks: books dict
            ft: 'MD' for a markdown table, anything else gives CSV rows

        Return:
            list of row strings: the header row, for 'MD' the "--" separator
            row, then one row per section
        """
        rows = [self._join_fields(
                ['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT'], ft)]
        if ft=='MD':
            rows.append(self._join_fields(['--']*5, ft))

        for kb,vb in bks.items():
            author = vb['author']
            for ks, vs in vb.items():
                if ks in self._META_KEYS: continue
                marktime = self.format_time(' '.join(
                        [vs['day'],vs['week'],vs['meridiem'],vs['time']]))
                rows.append(self._join_fields(
                        [vs['type'],kb,author,marktime,vs['content'] or ''], ft))

        return rows

    def format_out(self,bks, fnpref, ft='MD'):
        """format output and write to file
        markdown format:
        TYPE | bookname | author | marktime | content
        --|--|--|--|--
        xx|xx|xx|xx|xx

        CSV format:
        TYPE,bookname,author,marktime,content
        xx,xx,xx,xx,xx

        Args:
            bks: books dict
            fnpref: output file name without extension
            ft: can be 'MD'/'JSON'/'CSV'

        Returns: None, the result is written to fnpref + extension

        Raises: KeyError when ft is not one of the supported formats
        """
        suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
        op = fnpref+suff[ft]

        with open(op, 'w', encoding='utf8', errors='ignore') as fw:
            if ft=='JSON':
                fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
            else:
                fw.writelines(row+'\n' for row in self.format_data(bks, ft))

    @staticmethod
    def _is_duplicate(prev, cur):
        """True when both sections have the same type and one content contains the other."""
        if prev['type'] != cur['type']:
            return False
        a, b = prev['content'], cur['content']
        if not a or not b:
            return False
        return a in b or b in a

    def drop_duplicate(self,bks):
        """ drop duplicated section

        If I mark second time in same place, kindle will create two note,
        so I need to remove the duplication record.  A section is compared
        with the one right before it in the same book; on a match the
        earlier one is removed.

        Args:
            bks: books dict, changed in place
        Return:
            books remove duplicate sections; every book gets a 'lines'
            entry holding the number of sections it keeps
        """
        for kb,vb in bks.items():
            # start over for every book: the predecessor must live in the
            # same book, otherwise removing it raises KeyError
            preks, prevs = None, None
            kept = 0
            # iterate a snapshot, the dict is changed inside the loop
            for ks, vs in list(vb.items()):
                if ks in self._META_KEYS: continue
                if prevs is not None and self._is_duplicate(prevs, vs):
                    vb.pop(preks)
                    kept -= 1
                kept += 1
                preks, prevs = ks, vs
            vb['lines'] = kept

        return bks

    def get_bookname_num(self,bks):
        """ get note number of booknames
        Args:
            bks: books dict
        Return: list [total number, dict {bookname:num,...}]
        """
        bksnum = defaultdict(int)
        nu = 0
        for kb,vb in bks.items():
            n = sum(1 for ks in vb if ks not in self._META_KEYS)
            bksnum[kb] = n   # books without sections are listed with 0
            nu += n

        return [nu, bksnum]

    def get_author_num(self,bks):
        """ get note number of author
        Args:
            bks: books dict
        Return: list [total number, dict {author:num,...}]
        """
        bksnum = defaultdict(int)
        nu = 0
        for kb,vb in bks.items():
            n = sum(1 for ks in vb if ks not in self._META_KEYS)
            if n:
                bksnum[vb['author']] += n
                nu += n

        return [nu, bksnum]

    def filter_clips(self, bks, info=None, tp=0):
        """ filter clips
        Args:
            bks: books dict
            info: filter by bookname or author information
            tp: type to be filter
                0: root item clicked
                1: bookname item clicked
                2: author item clicked
        Return: list:
            [[Type,Bookname,Author,Position,Date,content],
             [Type,Bookname,Author,Position,Date,content]
            ....]
        """
        if tp==0:
            # do not filter
            nbks = bks
        else:
            nbks = {kb: vb for kb, vb in bks.items()
                    if (tp==1 and kb==info) or (tp==2 and vb['author']==info)}

        seclist = []
        for kb, vb in nbks.items():
            for ks,vs in vb.items():
                if ks in self._META_KEYS: continue
                tm = self.format_time(' '.join(
                        [vs['day'],vs['week'],vs['meridiem'],vs['time']]))
                nttype = self._TYPE_LABELS.get(vs['type'], '笔记')
                seclist.append([nttype,kb,vb['author'],vs['position'],tm,vs['content']])

        return seclist

    def add_note_to_highlight(self,bks):
        """ append note content to corresponding highlight
        and remove NT sections

        A note is merged when the section right before it in the same book
        is a highlight.

        Args:
            bks: books dict, changed in place
        Return:
            changed books
        """
        for kb,vb in bks.items():
            # start over for every book so that a note opening a book is
            # never attached to the last highlight of the previous book
            prevs = None
            for ks,vs in list(vb.items()):
                if ks in self._META_KEYS: continue
                if prevs is not None and \
                        [prevs['type'], vs['type']] == ['HL','NT']:
                    prevs['content'] = (prevs['content'] or '') + \
                            NTPREF + (vs['content'] or '')
                    vb.pop(ks)
                    # keep an existing section counter in step
                    if isinstance(vb.get('lines'), int):
                        vb['lines'] -= 1

                prevs = vs

        return bks

    def search_clip(self,bks, s, t='ALL', p='ALL'):
        """search clip, searching scope may be title/author/content
        Args:
            input: bks: books dict
                   s: key word, used as a regular expression
                   t: 'ALL'
                      'HL'
                      'BM'
                      'NT'
                   p: 'ALL'
                      'TITLE'
                      'AUTHOR'
                      'CONTENT'
        Return:
        [number of result , result dict]
        the result dict holds only books with at least one hit; 'lines' of
        such a book is its number of hits
        """
        nbks = defaultdict(dict)
        nu = 0
        pattern = re.compile(s)
        for kb,vb in bks.items():
            author = vb['author']
            hits = {}
            for ks,vs in vb.items():
                # the 'lines' counter of the source must not be copied, it
                # would replace the hit counter of the result
                if ks in self._META_KEYS: continue
                if t not in ['ALL', vs['type']]: continue
                content = vs['content'] or ''
                scopestr = {'ALL':''.join([kb,author,content]), \
                        'TITLE':kb, 'AUTHOR':author, 'CONTENT':content}
                if pattern.search(scopestr[p]):
                    hits[ks] = vs
            if hits:
                nbks[kb]['lines'] = len(hits)
                nbks[kb]['author'] = author
                nbks[kb].update(hits)
                nu += len(hits)

        return [nu,nbks]

    # to be implement
    def statistic(self,bks):
        """Placeholder, does nothing yet."""
        pass

    def dict2json(self,d):
        """convert dict to json
        Args: d is the dict
        Return: json string
        """
        return json.dumps(d)

    def json2dict(self,jf):
        """convert json file to dict
        Args: jf is the file saved json string
        Return: dict
        """
        with open(jf, 'r', encoding='utf8', errors='ignore') as f:
            return json.load(f)

    def get_kindle_path(self):
        """check and return kindle device path
        Args:
        Return:
        if kindle connected, return path string of kindle device
        (drive name on windows, '/Volumes/Kindle' on posix systems)
        else return False
        """
        if os.name == 'nt':  # windows - not tested
            try:
                proc = subprocess.run(
                        ['wmic', 'logicaldisk', 'get', 'name,volumename'],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except OSError:
                # wmic is not available
                return False
            sout = proc.stdout.decode('utf-8', errors='ignore')
            for d in sout.splitlines():
                if 'Kindle' in d:
                    # first whitespace separated column is the drive name
                    fields = d.split()
                    if fields: return fields[0]
            return False

        if os.name == 'posix':  # mac os
            kp = '/Volumes/Kindle'
            try:
                names = os.listdir(kp)
            except OSError:
                return False
            # same condition as a non-empty "ls" listing: hidden entries
            # do not count
            if any(not n.startswith('.') for n in names):
                return kp

        return False

    def import_clips(self, fp=None):
        """import clips from local file or kindle
        each section is closed by a '==========' line; the non-blank lines
        before it are title, position/date and content

        The highlight/note counters are restarted, so they describe the
        returned books dict, and the refresh time is updated.

        Args: fp - file path, CLIPPATH when omitted
        Return: books dict
        Raises: OSError when the file cannot be opened
        """
        if fp is None:
            fp = CLIPPATH

        self.hlnum = 0
        self.ntnum = 0

        bks  = defaultdict(dict)
        sidx = 0      # index of the last stored section
        sec  = []     # lines collected for the current section

        # loop to fill books dict
        with open(fp, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                line = line.strip()
                if not line: continue

                if LASTLINE not in line:
                    if len(sec)<3:
                        sec.append(line)
                    else:
                        # content more than 1 line
                        sec[2] += ' '+line
                    continue

                # separator reached: parse the section, start the next one
                secd = self.parse_section(sec,sidx+1)
                sec = []
                if not secd:
                    # BM or not correct format section
                    continue

                sidx += 1
                bn    = secd['bookname']
                entry = secd[bn][str(sidx)]
                bks[bn]['author'] = secd[bn]['author']
                bks[bn][str(sidx)] = entry

                # not add note to highlight content here,
                # because NT maybe duplicated, we need remove duplication record before
                if entry['type']=='HL': self.hlnum += 1
                elif entry['type']=='NT': self.ntnum += 1

        self.refleshtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        return bks

if __name__=='__main__':
    km = kMan()

    # load the default clippings file, then remove duplicated sections
    books = km.import_clips()
    km.drop_duplicate(books)

    # try the search on content, title and author; every search with at
    # least one hit is exported as a markdown table
    for keyword, scope, outname in (
            ('三大都市圈', 'CONTENT', 'searchcontent'),
            ('经济',       'TITLE',   'searchtitle'),
            ('巴曙松',     'AUTHOR',  'searchauthor')):
        searchnote = km.search_clip(books, keyword, 'ALL', scope)
        if searchnote[0] > 0:
            km.format_out(searchnote[1], outname, ft='MD')

    # per-book and per-author section counts
    print(km.get_bookname_num(books))
    print(km.get_author_num(books))

    # merge every note into the highlight it follows and drop the note
    km.add_note_to_highlight(books)

    # round trip through a json file must give back the same books dict
    roundtrip = './xx'
    with open(roundtrip, 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(km.dict2json(books))
    if km.json2dict(roundtrip)==books:
        print( 'test OK')

    km.format_out(books, OUTPREF, ft='MD')

    # dump the final data as readable json into the log
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))