#############################################
##   PROGRAM: file2.py
##   AUTHOR:  Chengan
##   CREATE:  20200526
##   douboer@gmail.com
#############################################

import platform
import re
import json
import logging
from collections import defaultdict

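# Data structure: a dict keyed by book name. Each book maps 'author' plus one
# entry per clipping section. NOTE: the sketch below labels sections as
# "section<N>", but the code keys them by the bare index string (e.g. "1636").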
'''
books =
{
    "bookname_xxx": {
        "author": "李",
        "section1636": {
            "content": "张",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "311-311",
            "time": "3:00:53",
            "type": "HL",
            "week": "星期五"
        },
        "section1651": {
            "content": "治",
            "day": "2020年4月3日",
            "meridiem": "下午",
            "position": "514",
            "time": "3:43:50",
            "type": "NT",
            "week": "星期五"
        },
        "section1814": {
            "content": null,
            "day": "2020年4月12日",
            "meridiem": "下午",
            "position": "5186",
            "time": "2:20:12",
            "type": "BM",
            "week": "星期日"
        },
        ...
    },
    ...
}
'''

# Host OS tag, meant for adjusting the clippings path per operating system.
# platform.system() reports 'Windows', 'Linux' or 'Darwin' (capitalised, not
# upper-case), so the lookup must use 'Linux'. Anything that is neither
# Windows nor Linux is treated as 'MAC', as before.
SYS = {'Windows': 'WIN', 'Linux': 'LINUX'}.get(platform.system(), 'MAC')

# some constants
LASTLINE = '=========='   # separator line that closes every clipping section
NTPREF   = '--CG注:'      # prefix put in front of a note merged into a highlight
CLIPPATH = './My Clippings.txt' # /Volumes/Kindle/documents/My\ Clippings.txt
STAT     = 'NONE'
DEBUG    = 1   # 0 - INFO; 1 - DEBUG
LOG2FILE = 1   # 0 - to stdio; 1 - to file

# log info - configure the root logger once at import time
logger = logging.getLogger()
# A richer alternative format, kept for reference:
#   '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
formatter = logging.Formatter('')

if LOG2FILE:
    # The log carries non-ASCII text (clipping content, JSON dumped with
    # ensure_ascii=False), so force UTF-8 instead of relying on the locale
    # encoding of the host.
    handler = logging.FileHandler("log", encoding='utf8')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
else:
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Set the level explicitly in both modes: without this the file-logging mode
# with DEBUG == 0 would stay at the root default (WARNING) and drop INFO.
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)

# author & bookname info: "<bookname> (<author>)", optionally preceded by a BOM.
# NOTE: the pattern is compiled with re.X, so the literal blank in front of
# "\(" is ignored and group 1 keeps the blank that separates the title from
# the parenthesis; parse_section() strips it.
au = re.compile(
r'''
^\ufeff*
(.+) \(         #bookname
(.+)\)          #author
''', flags=re.X )

# page & date info, shaped like:
#   #<pos>[-<pos>] ... <type> ... <YYYY>年<M>月<D>日星期<X> <meridiem><h:m:s>
da = re.compile(
r'''
\#
(\d+(?:-\d+)?)                 #group1 - position: "514", "311-311" or a single digit
.+
(笔记|标注|书签)               #group2 - type
.+
(\d{4}年\d{1,2}月\d{1,2}日)    #group3 - xxxx年xx月xx日
(星期.)                        #group4 - week
\s
(..)                           #group5 - pm/am
(\d{1,2}:\d{1,2}:\d{1,2})      #group6 - time
''', flags=re.X )

def parse_section(sec,idx):
    """Parse the lines of one clipping section into a nested dict.

    Args:
        sec: list of the non-blank lines of the section, in order:
            [title/author line, position/date line, content].
        idx: section index; ``str(idx)`` is used as the key of the entry.

    Returns:
        False when the section cannot be used:
          * it has two lines or fewer (a bookmark, or a highlight over a
            picture, has no content line);
          * the title line or the position/date line does not match the
            expected format.
        Otherwise a defaultdict shaped like::

            {'bookname': bookname,
             bookname: {
                 'author': author,
                 str(idx): {
                     'type': 'HL' | 'NT' | 'BM',
                     'position': '311-311',
                     'day': '2020年5月26日',
                     'week': '星期二',
                     'meridiem': '下午',
                     'time': '10:26:31',
                     'content': content}}}
    """
    # 1. highlight over the picture, the content(#3 line) is empty, only two lines
    # 2. bookmark section only two lines
    # 3. other not correct format < 2
    if len(sec)<=2:
        return False

    authinfo = sec[0]
    dateinfo = sec[1]
    content  = sec[2] if len(sec)==3 else None

    das = da.search(dateinfo)
    aus = au.search(authinfo)
    # A header that does not follow the expected layout is reported as an
    # unusable section instead of crashing on None.group().
    if das is None or aus is None:
        return False

    # type of section:
    #   HL - section is a highlight
    #   NT - section is a note
    #   BM - section is a bookmark
    tpy = {'标注': 'HL', '笔记': 'NT'}.get(das.group(2), 'BM')

    # strip(): drop the blank captured between the title and "(" (see `au`)
    bookname = aus.group(1).strip()
    author   = aus.group(2)

    section = defaultdict(dict)
    section[bookname]['author'] = author
    section['bookname'] = bookname
    section[bookname][str(idx)] = {
            'type':tpy,
            'position':das.group(1),
            'day':das.group(3),
            'week':das.group(4),
            'meridiem':das.group(5),
            'time':das.group(6),
            'content':content }

    return section

def formmat_out(books,f='MD'):
    """Format the parsed clippings for output - placeholder, not implemented.

    Args:
        books: dict of parsed clippings.
        f: requested output format - 'MD', 'TXT' or 'JSON'.

    Returns:
        None; the function currently does nothing with its arguments.
    """
    return None

def search_clip(books, s, t='ALL', p='ALL'):
    """Search the clippings for a key word - placeholder, not implemented.

    The searching scope may be title, author or content.

    Args:
        books: dict of parsed clippings.
        s: key word to look for.
        t: section type filter - 'ALL', 'HL', 'BM' or 'NT'.
        p: searching scope - 'ALL', 'TITLE', 'AUTHOR' or 'CONTENT'.

    Returns:
        None; the function currently does nothing with its arguments.
    """
    return None

if __name__ == '__main__':
    # Each section is made of a title/author line, a position/date line and
    # the content, and is closed by a '==========' line. Lines are collected
    # until that separator is met, then the section is parsed and stored.
    with open(CLIPPATH, 'r', encoding='utf8', errors='ignore') as f:
        books   = defaultdict(dict)
        sidx    = 0     # index of the last section stored in `books`
        idx     = 0     # number of non-blank lines seen in the current section
        sec     = []    # [title/author line, position/date line, content]
        STAT    = 'START'
        # iterate the file lazily instead of loading every line up front
        for line in f:
            line = line.strip()
            if not line: continue
            idx += 1

            if LASTLINE not in line:
                # content more than 1 line: fold the extra lines into sec[2]
                if idx>3:
                    sec[2] += str(' '+line)
                    logger.debug('idx %s %s', idx, sec[2])
                else:
                    sec.append(line)
                    logger.debug('idx %s %s', idx, sec[idx-1])
            else:
                idx   = 0
                sidx += 1

                # parsing section & fill data structure
                secd = parse_section(sec,sidx)

                if secd:
                    bn  = secd['bookname']
                    tpy = secd[bn][str(sidx)]['type']

                    books[bn]['author'] = secd[bn]['author']
                    books[bn][str(sidx)] = secd[bn][str(sidx)]

                    # A note is appended to the highlight stored right before
                    # it. Use .get(): the previous index does not exist for
                    # this book when the note is the very first section or
                    # when the previous section belongs to another book.
                    prev = books[bn].get(str(sidx-1))
                    if tpy=='NT' and prev is not None and prev['type']=='HL':
                        prev['content'] += str(NTPREF+sec[2])
                else: # BM or not correct format section
                    sidx -= 1

                # initial section for next section loop
                sec = []

    # print data with json format (skip the serialisation when DEBUG is off)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(json.dumps(books, indent=4, sort_keys=True, ensure_ascii=False))