# kman/kman.py


#########################################################
## @file   : kman.py
## @desc   : kindle note management tool
## @create : 20200526
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import re
import json
import logging
import platform
from collections import defaultdict

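# Data structure: a nested dict keyed by book name. Each book holds its
# 'author' plus one entry per clipping section. The sample below labels the
# entries "sectionNNNN"; the parser keys them by the bare index string.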
'''
books =
{
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
}
'''

# Host OS tag, intended for adapting the clippings path to the platform.
# platform.system() reports 'Windows', 'Linux' or 'Darwin' (mixed case, never
# 'LINUX'); anything that is neither Windows nor Linux is treated as macOS.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')

# Module-wide settings.

# Separator line that closes every clipping section.
LASTLINE = '=========='
# Marker put in front of a note that is merged into the preceding highlight.
NTPREF = '--CG注:'
# Clippings file to read; on a mounted Kindle it is found at
# /Volumes/Kindle/documents/My\ Clippings.txt
CLIPPATH = './My Clippings.txt'
# Output file path without extension.
OUTPREF = './clip'
# Log verbosity switch: 0 - INFO; 1 - DEBUG
DEBUG = 1
# Log destination switch: 0 - to stdio; 1 - to file
LOG2FILE = 1

# Logging setup. The root logger is configured, so the settings apply to
# everything that logs through the `logging` module.
logger = logging.getLogger()
# An empty format string makes the Formatter fall back to its default,
# message-only layout.
formatter = logging.Formatter('')

if LOG2FILE:
    # The clippings contain CJK text, so the log file is written as UTF-8
    # explicitly instead of relying on the platform default encoding.
    handler = logging.FileHandler("log", encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Set the level in both modes: without this the file-logging branch would
# leave the root logger at its WARNING default and drop INFO records.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)

# Title line of a section: "<bookname> (<author>)", optionally preceded by
# byte-order marks.
# NOTE: with re.X unescaped whitespace inside the pattern is ignored, so the
# blank between the title and '(' is matched explicitly with \s* and kept out
# of the bookname group (otherwise every bookname ends with a trailing space).
au = re.compile(
r'''
^\ufeff*
(.*\S)          #group1 - bookname, without trailing blanks
\s*\(
(.+)\)          #group2 - author
''', flags=re.X )

# Position & date line of a section. Matches lines such as
#   "- 您在位置 #311-311的标注 | 添加于 2020年4月3日星期五 下午3:00:53"
# The position is either a single value ("#5", "#514") or a range
# ("#311-311"); the optional range part is a non-capturing group so the
# numbering of the remaining groups is unchanged.
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                 #group1 - position (single value or range)
.+
(笔记|标注|书签)               #group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
(星期.)                        #group4 - week
\s
(..)                           #group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
''', flags=re.X )

def parse_section(s,i):
    """Parse one clipping section into a nested dict.

    Args:
        s: list with the lines of one section: s[0] is the
           "bookname (author)" line, s[1] the position/date line and
           s[2:] the content (several content lines are joined with a
           single space)
        i: section index; str(i) becomes the key of the parsed entry

    Returns:
        False when the section has fewer than three lines (bookmarks and
        highlights over a picture have no content line) or when the title
        line or the position/date line does not have the expected format.
        Otherwise a defaultdict like this:
        d = { 'bookname':bookname,
               bookname: {
                  'author':author,
                  str(i):{
                      'type':'HL',
                      'position':'123',
                      'day':'2020年5月26日',
                      'week':'星期二',
                      'meridiem':'PM',
                      'time':'10:26:31',
                      'content':content }}}
    """

    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if len(s)<=2:
        return False

    # Work on the argument only; the function must not depend on any
    # module-level variable of the calling script.
    authinfo = s[0]
    dateinfo = s[1]
    content  = ' '.join(s[2:])

    das = da.search(dateinfo)
    aus = au.search(authinfo)
    # Treat unparsable lines like any other malformed section instead of
    # failing with AttributeError on a None match.
    if das is None or aus is None:
        return False

    # type of section:
    #   HL - section is a highlight
    #   NT - section is a note
    #   BM - section is a bookmark (any other matched type word)
    kinds = {'标注': 'HL', '笔记': 'NT'}
    tpy = kinds.get(das.group(2), 'BM')

    bookname = aus.group(1)
    author   = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(i)] = {
            'type':tpy,
            'position':das.group(1),
            'day':das.group(3),
            'week':das.group(4),
            'meridiem':das.group(5),
            'time':das.group(6),
            'content':content }

    return section

def format_out(bks, ft='MD'):
    """Write the books dict to the file OUTPREF + extension.

    Current behaviour per format:
        'JSON': pretty-printed JSON (indented, sorted keys, non-ASCII kept)
        'MD'  : not implemented yet, an empty file is created
        'CSV' : compact JSON dump, only meant for loading the data back

    Planned MARKDOWN format:
    TYPE | bookname | author | marktime | content
    --|--|--|--|--
    xx|xx|xx|xx|xx

    Planned CSV format:
    TYPE,bookname,author,marktime,content
    xx,xx,xx,xx,xx

    marktime: 20200403 PM 3:0:3 星期五

    Args:
        bks: books dict
        ft: output format, can be 'MD'/'JSON'/'CSV'

    Returns: None, the formatted data is written to the output file

    Raises:
        KeyError: if ft is not one of the supported formats
    """

    suff = {'MD':'.md','CSV':'.csv','JSON':'.json'}
    op = OUTPREF+suff[ft]

    # NOTE(review): the file is GBK-encoded and characters that GBK cannot
    # represent are dropped silently (errors='ignore') - confirm this is
    # intended, since the JSON loader in this module reads UTF-8.
    with open(op, 'w', encoding='gbk', errors='ignore') as fw:
        if ft=='JSON':
            # write through the file handle `fw`; `ft` is the format name
            fw.write(json.dumps(bks, indent=4, sort_keys=True, ensure_ascii=False))
        elif ft=='MD':
            pass # markdown output is not implemented yet
        else:
            fw.write(json.dumps(bks)) # only for load back

def statistic(bks):
    """Placeholder for statistics over the books dict.

    Args: bks is the books dict (currently unused)
    Return: None, nothing is computed yet
    """
    return None

def dict2json(d):
    """Serialize a dict to a JSON string.
    Args: d is the dict to serialize
    Return: the JSON text produced by json.dumps with its default settings
    """
    return json.dumps(d)

def json2dict(jf):
    """Load a dict from a file that holds a JSON string.
    Args: jf is the path of the file with the saved JSON string
          (read as UTF-8, undecodable bytes are ignored)
    Return: the object decoded by json.load, a dict for data saved
            with dict2json
    """
    with open(jf, 'r', encoding='utf8', errors='ignore') as fh:
        return json.load(fh)

def search_clip(bks, s, t='ALL', p='ALL'):
    """Search the clippings; not implemented yet, always returns None.

    The intended searching scope is title/author/content.

    Args:
        bks: books dict
        s: key word
        t: section type filter, one of 'ALL'/'HL'/'BM'/'NT'
        p: search scope, one of 'ALL'/'TITLE'/'AUTHOR'/'CONTENT'
    Return: intended to be the matching clipping content; currently None
    """
    return None

if __name__=='__main__':
    # Every section of the clippings file consists of a title line, a
    # position/date line and the content line(s), closed by a '=========='
    # separator. Blank lines are skipped, so the lines of a section are
    # collected until its separator shows up.
    books = defaultdict(dict)
    sidx  = 0    # count of accepted sections; also the key of the latest one
    idx   = 0    # counter of non-blank lines inside the current section
    sec   = []   # lines of the current section: [title, position/date, content]

    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if not line: continue
            idx += 1

            if LASTLINE not in line:
                if idx>3:
                    # content more than 1 line: fold it into the content slot
                    sec[2] += ' '+line
                    logger.debug('idx {} {}'.format(idx, sec[2]))
                else:
                    sec.append(line)
                    logger.debug('idx {} {}'.format(idx, sec[idx-1]))
                continue

            # separator reached: the section is complete
            idx   = 0
            sidx += 1

            # parsing section & fill data structure
            secd = parse_section(sec,sidx)

            if secd:
                bn  = secd['bookname']
                cur = secd[bn][str(sidx)]

                books[bn]['author'] = secd[bn]['author']
                books[bn][str(sidx)] = cur

                # A note that directly follows a highlight of the same book
                # is appended to that highlight. The previous index may be
                # missing (first section, or it belongs to another book),
                # so look it up without raising KeyError.
                prev = books[bn].get(str(sidx-1))
                if cur['type']=='NT' and prev is not None and prev['type']=='HL':
                    prev['content'] += NTPREF+sec[2]
            else: # BM or not correct format section
                sidx -= 1

            # initial section for next section loop
            sec = []

    # test dict json convert
    with open('./xx', 'w', encoding='utf8', errors='ignore') as fw:
        fw.write(dict2json(books))
    if json2dict('./xx')==books: print( 'test OK')

    # print data with json format; kept under the __main__ guard because
    # `books` only exists when the file runs as a script
    logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))