# kman/mobiparse/mobi/makencx.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from loguru import logger
from collections import defaultdict

from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str
from .compatibility_utils import unicode_argv, add_cp65001_codec

K8_BOUNDARY = b"BOUNDARY"
""" The section data that divides K8 mobi ebooks. """

class unpackException(Exception):
    pass

# import the kindleunpack support libraries
from .unpack_structure import fileNames
from .mobi_sectioner import Sectionizer
from .mobi_header import MobiHeader
from .mobi_ncx import ncxExtract


# input mobi file path
# output ncx dict
def extractNcx(infile):
    infile = unicode_str(infile)
    mhdict = defaultdict(dict)

    # process the PalmDoc database header and verify it is a mobi
    sect = Sectionizer(infile)
    if sect.ident != b"BOOKMOBI" and sect.ident != b"TEXtREAd":
        raise unpackException("Invalid file format")

    logger.debug( "dumppalmheader ...")
    sect.dumppalmheader()

    # CGDBG
    print('infile {} '.format(infile))
    print('sect.dumpsectionsinfo() {}'.format(sect.dumpsectionsinfo()))
    print('sect.dumppalmheader() {}'.format(sect.dumppalmheader()))

    # scan sections to see if this is a compound mobi file (K8 format)
    # and build a list of all mobi headers to process.
    mhlst = []

    # CG mobi header
    mh = MobiHeader(sect, 0)
    metadata = mh.getMetaData()

    # if this is a mobi8-only file hasK8 here will be true
    mhlst.append(mh)
    K8Boundary = -1

    if mh.isK8():
        logger.debug("Unpacking a KF8 book...")
        hasK8 = True
    else:
        # CGDBG
        # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
        # Find out which
        hasK8 = False
        for i in range(len(sect.sectionoffsets) - 1):
            before, after = sect.sectionoffsets[i : i + 2]
            if (after - before) == 8:
                data = sect.loadSection(i)
                if data == K8_BOUNDARY:
                    sect.setsectiondescription(i, "Mobi/KF8 Boundary Section")
                    mh = MobiHeader(sect, i + 1)
                    hasK8 = True   # K8
                    mhlst.append(mh)
                    K8Boundary = i
                    break

        # hasK8 header information include K8
        if hasK8:
            logger.debug( "Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
        else:
            logger.debug("Unpacking a Mobipocket {0:d} book...".format(mh.version))

        # loop for process ncx and write to json with filename - booname.ncx.json
        for tmh in mhlst:
            # CG
            # process the toc ncx
            # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
            logger.debug("Processing ncx / toc ")
            print('hasK8 {} tmh.isK8 {}'.format(hasK8, tmh.isK8()))

            ncx = ncxExtract(tmh)
            ncx_data = ncx.parseNCX()

            # check the mobi header information is K8 or K7
            kn = 'k8ncx' if tmh.isK8() else 'k7ncx'
            mhdict[kn] = ncx_data

        return mhdict