# kman/kman.py


#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 2020/05/26
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import re
import json
import logging
import platform
from collections import defaultdict

# data structure - use dict
'''
books =
{
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
}
'''

# Host OS tag ('WIN' / 'LINUX' / 'MAC'), meant for choosing an OS specific
# clippings path. platform.system() reports 'Windows', 'Linux' or 'Darwin';
# the comparison is case sensitive, so it has to be 'Linux' (not 'LINUX').
# Every system that is neither Windows nor Linux is tagged 'MAC'.
SYS = 'WIN' if platform.system()=='Windows' else \
   ('LINUX' if platform.system()=='Linux' else 'MAC')

# --- module wide constants ---

# separator line that closes every section of the clippings file
LASTLINE = '=========='

# prefix put in front of a note when it is appended to its highlight
NTPREF = '--CG注:'

# clippings file to import; on a real device this would be
# './My Clippings.txt' or /Volumes/Kindle/documents/My\ Clippings.txt
CLIPPATH = './tclip.txt'

# prefix of the exported file name
OUTPREF = './clip'

# log verbosity: 0 - INFO; 1 - DEBUG
DEBUG = 1

# log destination: 0 - to stdio; 1 - to file
LOG2FILE = 1

# name of the log file
LOGFILE = 'log'

# default field delimiter
DELIMITER = '|'

# log info
# Configure the root logger: records go to LOGFILE when LOG2FILE is set,
# otherwise logging.basicConfig installs its default console handler.
logger = logging.getLogger()
#formatter = logging.Formatter
#    ('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# NOTE(review): an empty format string appears to fall back to the bare
# message text - confirm before relying on it.
formatter = logging.Formatter('')

if LOG2FILE:
    # explicit utf8: the logged notes are non-ASCII and the platform default
    # encoding may not be able to represent them
    handler = logging.FileHandler(LOGFILE, encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# DEBUG: 0 - INFO; 1 - DEBUG. The level is set explicitly for both
# destinations; otherwise the file branch would leave the root logger at its
# default WARNING level when DEBUG is 0.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)

#author & bookname info
# Title line of a section: "<bookname> (<author>)", optionally preceded by
# byte order mark(s). With re.X unescaped whitespace inside the pattern is
# ignored, so the blank between the title and "(" must be matched explicitly
# (\s*); otherwise it ends up as trailing whitespace of the book name.
au = re.compile(
r'''
^\ufeff*
(.*\S)\s*\(     #group1 - bookname, greedy: split at the last bracket pair
(.+)\)          #group2 - author
''', flags=re.X )

# page & date info
#\(\d\+-\{0,1}\d\+\).\+\(\d\{4}年\d\{1,2}月\d\{1,2}日\)\(星期.\) \(..\)\(\d\{1,2}:\d\{1,2}:\d\{1,2}\)
# The position is either a single number ("514") or a range ("311-311").
# It is written as \d+(?:-\d+)? so that one-digit positions match as well
# (the former \d+-{0,1}\d+ needed at least two digits).
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                 #group1 - page
.+
(笔记|标注|书签)               #group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
(星期.)                        #group4 - week
\s
(..)                           #group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
''', flags=re.X )

def parse_section(s,i):
    """parse one section of the clippings file

    Args:
        s: section line list - s[0] is the "bookname (author)" line,
           s[1] the position/type/date line, s[2:] the content line(s)
        i: section index, used (as a string) as the key of the section

    Returns:
        False when the section cannot be used: fewer than three lines
        (bookmark, highlight over a picture, broken section) or a
        title/date line that does not match the expected format.
        Otherwise a dict like this:
        d = { 'bookname':bookname,
               bookname: {
                  'author':author
                  '0':{
                      'type':'HL',
                      'position':'123',
                      'day':'2020年5月26日',
                      'week':'星期二',
                      'meridiem':'PM',
                      'time':'10:26:31'
                      'content':content }}}
        'type' is 'HL' (highlight), 'NT' (note) or 'BM' (bookmark).
    """
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if len(s)<=2:
        return False

    authinfo, dateinfo = s[0], s[1]
    # keep every content line; a three line section yields exactly s[2]
    content = ' '.join(s[2:])

    das = da.search(dateinfo)
    aus = au.search(authinfo)
    # a line that does not follow the expected layout makes the whole
    # section unusable; report it like the other malformed sections
    if das is None or aus is None:
        return False

    # parse #2 line
    pos, kind, day, week, pmam, time = das.groups()
    tpy = {'标注': 'HL', '笔记': 'NT'}.get(kind, 'BM')

    # parse #1 line
    bookname, author = aus.groups()

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(i)] = {
            'type':tpy,
            'position':pos,
            'day':day,
            'week':week,
            'meridiem':pmam,
            'time':time,
            'content':content }

    return section

def format_time(ds):
    """ format date, converting the 12-hour clock to a 24-hour clock

    Args:
        ds: space separated "day week meridiem time",
            e.g. 2020年1月13日 星期一 下午 8:11:05
    Return:
        'year/month/day hour:minute:second', e.g. 2020/1/13 20:11:05
        (the hour is not zero padded)
    """
    d = ds.split(' ')
    res = re.search(r'(\d{4}).(\d{1,2}).(\d{1,2})',d[0])
    ymd = '/'.join(res.groups())
    res = re.search(r'(\d{1,2})(:\d{1,2}:\d{1,2})',d[3])
    # 12 o'clock is hour 0 of its half day: 上午12:05 -> 0:05, 下午12:30 -> 12:30
    hour = int(res.group(1)) % 12
    # everything that is not marked as morning is treated as afternoon
    if d[2] != '上午':
        hour += 12

    return ymd + ' ' + str(hour) + res.group(2)

def format_data(bks, ft='MD'):
    """ format data for MD & CSV

    Args:
        bks: books dict
        ft: 'MD' for a markdown table; any other value gives comma
            separated (CSV) lines

    Return:
        flat list of text lines: the header line (for 'MD' followed by the
        '--' separator line), then one line per section with the columns
        TYPE, BOOKNAME, AUTHOR, MARKTIME, CONTENT
    """
    sep = '|' if ft=='MD' else ','

    def clean(field):
        # a section without content (None) becomes an empty cell
        text = '' if field is None else str(field)
        if ft=='MD':
            # an unescaped '|' would start a new table cell
            return text.replace('|', '\\|')
        # CSV: quote fields holding the delimiter, quotes or line breaks
        if any(c in text for c in ',"\r\n'):
            return '"' + text.replace('"', '""') + '"'
        return text

    rows = [sep.join(['TYPE','BOOKNAME','AUTHOR','MARKTIME','CONTENT'])]
    if ft=='MD':
        rows.append(sep.join(['--']*5))

    for kb,vb in bks.items():
        author = vb['author']
        for ks, vs in vb.items():
            # 'author' and 'lines' are book level entries, not sections
            if ks in ('author', 'lines'): continue
            marktime = format_time(' '.join(
                    [vs['day'],vs['week'],vs['meridiem'],vs['time']]))
            rows.append(sep.join(clean(f) for f in
                    (vs['type'],kb,author,marktime,vs['content'])))

    return rows

def format_out(bks, fnpref, ft='MD'):
    """format output and write to file
    markdown format:
    TYPE | bookname | author | marktime | content
    --|--|--|--|--
    xx|xx|xx|xx|xx

    CSV format:
    TYPE,bookname,author,marktime,content
    xx,xx,xx,xx,xx

    marktime is the string produced by format_time

    Args:
        bks: books dict
        fnpref: output file name without suffix
        ft: 'MD'/'JSON'/'CSV' - the matching suffix (.md/.json/.csv) is
            appended to fnpref; any other value writes compact json
            (only meant for loading back) to fnpref without a suffix

    Returns: None, the result is written to the file
    """

    suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
    # .get() instead of [] so that an unknown format reaches the fallback
    # branch below instead of raising KeyError
    op = fnpref+suff.get(ft, '')

    with open(op, 'w', encoding='utf8', errors='ignore') as fw:
        if ft=='JSON':
            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
        elif ft in ('MD','CSV'):
            fw.writelines(line+'\n' for line in format_data(bks, ft))
        else:
            fw.write(json.dumps(bks)) # only for load back

def drop_duplicate(bks):
    """ drop duplicated section

    If I mark second time in same place, kindle will create two note,
    so I need to remove the duplication record.
    Two consecutive sections of the same book count as duplicates when they
    have the same type and the content of one is contained in the content
    of the other; the earlier section is removed, the later one kept.
    Sections without content are never treated as duplicates.

    Args:
        bks: books dict, changed in place
    Return:
        books remove duplicate sections; every book gets a 'lines' entry
        holding the number of sections that are left
    """
    for kb,vb in bks.items():
        vb['lines'] = 0
        # the comparison state starts fresh for every book: a section must
        # not be compared with (or pop the key of) a section of another book
        preks, prevs = None, None
        kept = 0
        # iterate over a snapshot, the dict is changed inside the loop
        for ks, vs in list(vb.items()):
            if ks in ('author', 'lines'): continue
            kept += 1
            if prevs is not None and prevs['type'] == vs['type']:
                cur, old = vs['content'], prevs['content']
                # empty/None content cannot be compared in a meaningful way
                if cur and old and (cur in old or old in cur):
                    vb.pop(preks)
                    kept -= 1

            preks, prevs = ks, vs
        vb['lines'] = kept

    return bks

def add_note_to_highlight(bks):
    """ append note content to corresponding highlight
    and remove NT sections

    A note (NT) that directly follows a highlight (HL) of the same book is
    appended to the content of that highlight, prefixed with NTPREF, and
    the note section is removed. Other notes are left untouched.

    Args:
        bks: books dict, changed in place
    Return:
        changed books
    """
    for kb,vb in bks.items():
        # start fresh for every book: the first section of a book must not
        # be merged into the last section of the previous book
        preks, prevs = None, None
        # iterate over a snapshot, the dict is changed inside the loop
        for ks,vs in list(vb.items()):
            if ks in ('author', 'lines'): continue
            if prevs is not None and \
                    prevs['type'] == 'HL' and vs['type'] == 'NT':
                # missing (None) content is treated as empty text
                vb[preks]['content'] = (vb[preks]['content'] or '') + \
                        NTPREF + (vs['content'] or '')
                vb.pop(ks)
                # keep the section counter of the book in step, if present
                if isinstance(vb.get('lines'), int):
                    vb['lines'] -= 1

            preks, prevs = ks, vs

    return bks

def search_clip(bks, s, t='ALL', p='ALL'):
    """search clip, searching scope may be title/author/content
    Args:
        input: bks: books dict
               s: key word, used as a regular expression (re.search)
               t: section type to look at
                  'ALL'
                  'HL'
                  'BM'
                  'NT'
               p: searching scope
                  'ALL' (title + author + content)
                  'TITLE'
                  'AUTHOR'
                  'CONTENT'
    Return: [number of found sections, books dict of the found sections]
            Only books with at least one found section are returned; each
            of them holds 'lines' (number of found sections of the book),
            'author' and the found sections.
    """
    pattern = re.compile(s)
    nbks = defaultdict(dict)
    nu = 0
    for kb,vb in bks.items():
        author = vb.get('author') or ''
        found = {}
        for ks,vs in vb.items():
            # book level entries are no sections. The 'lines' value of the
            # source book must not be copied: it would overwrite the number
            # of found sections counted here.
            if ks in ('author', 'lines'): continue
            if t not in ('ALL', vs['type']): continue
            content = vs['content'] or ''   # None content -> empty text
            scopestr = {'ALL':''.join([kb,author,content]),
                    'TITLE':kb, 'AUTHOR':author, 'CONTENT':content}
            if pattern.search(scopestr[p]):
                found[ks] = vs
        if found:
            nbks[kb]['lines'] = len(found)
            if 'author' in vb:
                nbks[kb]['author'] = vb['author']
            nbks[kb].update(found)
            nu += len(found)

    return [nu,nbks]

# to be implemented
def statistic(bks):
    """placeholder for statistics over the books dict, does nothing yet

    Args:
        bks: books dict
    Return:
        None
    """
    return None

def dict2json(d):
    """serialize a dict to a json string (json.dumps with default options)
    Args: d is the dict
    Return: json string
    """
    return json.dumps(d)

def json2dict(jf):
    """load the json document stored in a file
    Args: jf is the file saved json string
    Return: the decoded document (a dict when the file holds a json object)
    """
    with open(jf, 'r', encoding='utf8', errors='ignore') as fh:
        return json.load(fh)

def import_clips(path=None):
    """read a clippings file and build the books dict

    Every section of the file is closed by a line containing LASTLINE.
    Inside a section the first non-blank line is the title line, the second
    the position/date line; all following non-blank lines are the content
    and are joined with a blank. Sections rejected by parse_section
    (bookmarks, malformed sections) are skipped and get no index.

    Args:
        path: clippings file to read, CLIPPATH when omitted
    Return:
        books dict: {bookname: {'author': author, '<index>': section, ...}}
        where the index counts the accepted sections starting with 1
    """
    bks  = defaultdict(dict)
    sidx = 0    # index of the last accepted section
    sec  = []   # lines of the section currently being collected

    with open(CLIPPATH if path is None else path,
              'r', encoding='utf8', errors='ignore') as f:
        # iterate the file lazily instead of loading all lines at once
        for line in f:
            line = line.strip()
            if not line: continue

            if LASTLINE not in line:
                if len(sec) < 3:
                    sec.append(line)
                else:
                    # content more than 1 line
                    sec[2] += ' '+line
                continue

            # end of section: parse it & fill data structure
            secd = parse_section(sec, sidx+1)
            # initial section for next section loop
            sec = []
            if not secd: # BM or not correct format section
                continue

            sidx += 1
            bn = secd['bookname']
            bks[bn]['author'] = secd[bn]['author']
            # not add note to highlight content here,
            # because NT maybe duplicated, we need remove duplication record before
            bks[bn][str(sidx)] = secd[bn][str(sidx)]

    return bks


if __name__=='__main__':
    # load every section of the clippings file into the books dict
    books = import_clips()

    # remove duplication
    drop_duplicate(books)

    # test search note function: one search per scope, every search with
    # at least one hit is written to its own markdown file
    for keyword, scope, outname in (
            ('三大都市圈', 'CONTENT', 'searchcontent'),
            ('经济', 'TITLE', 'searchtitle'),
            ('巴曙松', 'AUTHOR', 'searchauthor')):
        searchnote = search_clip(books, keyword, 'ALL', scope)
        if searchnote[0] > 0:
            format_out(searchnote[1], outname, ft='MD')

    # add note content to highlight, then delete note
    add_note_to_highlight(books)

    # test dict json convert: dump to a file, load it back and compare
    dumpfile = './xx'
    with open(dumpfile, 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict(dumpfile)==books:
        print( 'test OK')

    # export all books as a markdown table
    format_out(books, OUTPREF, ft='MD')

    # print data with json format
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))