kman/mobiparse/mobi/mobi_ncx.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

import os
from .unipath import pathof
from loguru import logger


import re
import json

# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself, only b""

'''
NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
to Digital Talking Books, eBooks, and general web content models.
This DTD is an XML application that layers navigation functionality on top of SMIL 2.0  content.
The NCX defines a navigation path/model that may be applied upon existing publications,
without modification of the existing publication source, so long as the navigation targets within
the source publication can be directly referenced via a URI.

http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
'''

from .mobi_utils import toBase32
from .mobi_index import MobiIndex

# Set to True to make ncxExtract.parseNCX() log the raw index table and print
# every parsed NCX record.
DEBUG_NCX = False

class ncxExtract:
    """Read the NCX (navigation / table-of-contents) index of a MOBI book.

    The instance is built from a header object ``mh`` that must expose
    ``sect``, ``ncxidx`` and ``codec``.  Calling :meth:`parseNCX` turns the
    NCX index into a list of plain dicts, one per navigation record.
    """

    # INDX tag number -> (key stored in the record dict, index of the value
    # to read from that tag's value list).
    _TAG_FIELDS = {
        1: ("pos", 0),
        2: ("len", 0),
        3: ("noffs", 0),
        4: ("hlvl", 0),
        5: ("koffs", 0),
        6: ("pos_fid", 0),
        21: ("parent", 0),
        22: ("child1", 0),
        23: ("childn", 0),
    }

    def __init__(self, mh):
        """Remember the header ``mh`` and prepare an index reader.

        Args:
            mh: header object providing ``sect`` (passed to ``MobiIndex``),
                ``ncxidx`` (NCX index number, ``0xFFFFFFFF`` meaning "no NCX")
                and ``codec`` (used to decode CTOC strings).
        """
        self.mh = mh
        self.sect = self.mh.sect
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        self.indx_data = None

    def _ctoc_label(self, ctoc_text, offset, fallback):
        """Return the CTOC string stored at ``offset``, decoded with the book codec.

        ``fallback`` is returned as-is when ``offset`` is not present.  The
        placeholder must not be decoded: it is already text, and calling
        ``.decode`` on a ``str`` raises ``AttributeError`` on Python 3.
        """
        raw = ctoc_text.get(offset)
        if raw is None:
            return fallback
        return raw.decode(self.mh.codec)

    def _build_entry(self, num, text, tagMap, ctoc_text):
        """Build the record dict for one NCX index entry.

        Args:
            num: zero-based record number, stored under ``"num"``.
            text: entry name as bytes; decoded as UTF-8 into ``"name"``.
            tagMap: mapping of tag number to that tag's list of values.
            ctoc_text: mapping of CTOC offset to the encoded label bytes.

        Returns:
            dict with the default fields below, overridden by every tag that
            is present in ``tagMap``.
        """
        entry = {
            "name": text.decode("utf-8"),
            "pos": -1,
            "len": 0,
            "noffs": -1,
            "text": "Unknown Text",
            "hlvl": -1,
            "kind": "Unknown Kind",
            "pos_fid": None,
            "parent": -1,
            "child1": -1,
            "childn": -1,
            "num": num,
        }
        for tag, (fieldname, i) in self._TAG_FIELDS.items():
            if tag not in tagMap:
                continue
            fieldvalue = tagMap[tag][i]
            if tag == 6:
                # Tag 6 carries two values (fid and offset); both go through
                # toBase32 (called with 4 and 10) and are joined into a
                # "kindle:pos:fid:...:off:..." reference.
                pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
                pos_off = toBase32(tagMap[tag][i + 1], 10).decode("utf-8")
                fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
            entry[fieldname] = fieldvalue
            if tag == 3:
                # noffs is an offset into the CTOC: resolve the entry label.
                entry["text"] = self._ctoc_label(
                    ctoc_text, fieldvalue, "Unknown Text"
                )
            elif tag == 5:
                # koffs is an offset into the CTOC: resolve the entry kind.
                entry["kind"] = self._ctoc_label(
                    ctoc_text, fieldvalue, "Unknown Kind"
                )
        return entry

    @staticmethod
    def _dump_entry(entry):
        """Print one parsed record (debug aid, used when DEBUG_NCX is set).

        Field meaning, per the original author's notes:
          * position / 150 gives the real page number,
          * heading level is the nesting level of the section,
          * parent is the record number of the enclosing section,
          * first child .. last child is the record-number range of the
            sub-sections.
        """
        print("record number: ", entry["num"])
        print(
            "name: ", entry["name"],
        )
        print("position", entry["pos"], " length: ", entry["len"])
        print("text: ", entry["text"])
        print("kind: ", entry["kind"])
        print("heading level: ", entry["hlvl"])
        print("parent:", entry["parent"])
        print(
            "first child: ", entry["child1"], " last child: ", entry["childn"]
        )
        print("pos_fid is ", entry["pos_fid"])
        print("\n\n")

    def parseNCX(self):
        """Parse the NCX index into a list of record dicts.

        When ``self.ncxidx`` is ``0xFFFFFFFF`` there is no NCX index and the
        result is an empty list.  Otherwise the index is read through
        ``self.mi.getIndexData(self.ncxidx, "NCX")`` and each entry becomes a
        dict with the keys ``name``, ``pos``, ``len``, ``noffs``, ``text``,
        ``hlvl``, ``kind``, ``pos_fid``, ``parent``, ``child1``, ``childn``
        and ``num`` (plus ``koffs`` when tag 5 is present).  A record whose
        label or kind offset is missing from the CTOC keeps the
        ``"Unknown Text"`` / ``"Unknown Kind"`` placeholder.

        Example record::

            {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0,
             'text': '<chapter title>', 'hlvl': 0, 'kind': 'Unknown Kind',
             'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1,
             'num': 0}

        Returns:
            The list of record dicts; it is also stored in ``self.indx_data``.
        """
        indx_data = []
        if self.ncxidx != 0xFFFFFFFF:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                logger.debug("ctoc_text {}".format(ctoc_text))
                logger.debug("outtbl {}".format(outtbl))
            for num, (text, tagMap) in enumerate(outtbl):
                entry = self._build_entry(num, text, tagMap, ctoc_text)
                indx_data.append(entry)
                if DEBUG_NCX:
                    self._dump_entry(entry)
        self.indx_data = indx_data
        return indx_data