kman/mobiparse/mobi/mobi_header.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supoorted >= python 2.7.
""" set to True to use OrderedDict for MobiHeader.metadata."""

if DEBUG_USE_ORDERED_DICTIONARY:
    from collections import OrderedDict as dict_
else:
    dict_ = dict

from .compatibility_utils import PY2, unicode_str, hexlify, bord
from loguru import logger

if PY2:
    range = xrange

import struct
import uuid

# import the mobiunpack support libraries
from .mobi_utils import getLanguage
from .mobi_uncompress import HuffcdicReader, PalmdocReader, UncompressedReader


class unpackException(Exception):
    pass


def sortedHeaderKeys(mheader):
    hdrkeys = sorted(list(mheader.keys()), key=lambda akey: mheader[akey][0])
    return hdrkeys


# HD Containers have their own headers and their own EXTH
# this is just guesswork so far, making big assumption that
# metavalue key numbers remain the same in the CONT EXTH

# Note:  The layout of the CONT Header is still unknown
# so just deal with their EXTH sections for now


def dump_contexth(cpage, extheader):
    # determine text encoding
    codec = "windows-1252"
    codec_map = {
        1252: "windows-1252",
        65001: "utf-8",
    }
    if cpage in codec_map:
        codec = codec_map[cpage]
    if extheader == b"":
        return
    id_map_strings = {
        1: "Drm Server Id",
        2: "Drm Commerce Id",
        3: "Drm Ebookbase Book Id",
        4: "Drm Ebookbase Dep Id",
        100: "Creator",
        101: "Publisher",
        102: "Imprint",
        103: "Description",
        104: "ISBN",
        105: "Subject",
        106: "Published",
        107: "Review",
        108: "Contributor",
        109: "Rights",
        110: "SubjectCode",
        111: "Type",
        112: "Source",
        113: "ASIN",
        114: "versionNumber",
        117: "Adult",
        118: "Retail-Price",
        119: "Retail-Currency",
        120: "TSC",
        122: "fixed-layout",
        123: "book-type",
        124: "orientation-lock",
        126: "original-resolution",
        127: "zero-gutter",
        128: "zero-margin",
        129: "MetadataResourceURI",
        132: "RegionMagnification",
        150: "LendingEnabled",
        200: "DictShortName",
        501: "cdeType",
        502: "last_update_time",
        503: "Updated_Title",
        504: "CDEContentKey",
        505: "AmazonContentReference",
        506: "Title-Language",
        507: "Title-Display-Direction",
        508: "Title-Pronunciation",
        509: "Title-Collation",
        510: "Secondary-Title",
        511: "Secondary-Title-Language",
        512: "Secondary-Title-Direction",
        513: "Secondary-Title-Pronunciation",
        514: "Secondary-Title-Collation",
        515: "Author-Language",
        516: "Author-Display-Direction",
        517: "Author-Pronunciation",
        518: "Author-Collation",
        519: "Author-Type",
        520: "Publisher-Language",
        521: "Publisher-Display-Direction",
        522: "Publisher-Pronunciation",
        523: "Publisher-Collation",
        524: "Content-Language-Tag",
        525: "primary-writing-mode",
        526: "NCX-Ingested-By-Software",
        527: "page-progression-direction",
        528: "override-kindle-fonts",
        529: "Compression-Upgraded",
        530: "Soft-Hyphens-In-Content",
        531: "Dictionary_In_Langague",
        532: "Dictionary_Out_Language",
        533: "Font_Converted",
        534: "Amazon_Creator_Info",
        535: "Creator-Build-Tag",
        536: "HD-Media-Containers-Info",  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: "Resource-Container-Fidelity",
        539: "HD-Container-Mimetype",
        540: "Sample-For_Special-Purpose",
        541: "Kindletool-Operation-Information",
        542: "Container_Id",
        543: "Asset-Type",  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: "Unknown_544",
    }
    id_map_values = {
        115: "sample",
        116: "StartOffset",
        121: "Mobi8-Boundary-Section",
        125: "Embedded-Record-Count",
        130: "Offline-Sample",
        131: "Metadata-Record-Offset",
        201: "CoverOffset",
        202: "ThumbOffset",
        203: "HasFakeCover",
        204: "Creator-Software",
        205: "Creator-Major-Version",
        206: "Creator-Minor-Version",
        207: "Creator-Build-Number",
        401: "Clipping-Limit",
        402: "Publisher-Limit",
        404: "Text-to-Speech-Disabled",
        406: "Rental-Expiration-Time",
    }
    id_map_hexstrings = {
        208: "Watermark_(hex)",
        209: "Tamper-Proof-Keys_(hex)",
        300: "Font-Signature_(hex)",
        403: "Unknown_(403)_(hex)",
        405: "Ownership-Type_(hex)",
        407: "Unknown_(407)_(hex)",
        420: "Multimedia-Content-Reference_(hex)",
        450: "Locations_Match_(hex)",
        451: "Full-Story-Length_(hex)",
        452: "Sample-Start_Location_(hex)",
        453: "Sample-End-Location_(hex)",
    }
    _length, num_items = struct.unpack(b">LL", extheader[4:12])
    extheader = extheader[12:]
    pos = 0
    for _ in range(num_items):
        id, size = struct.unpack(b">LL", extheader[pos : pos + 8])
        content = extheader[pos + 8 : pos + size]
        if id in id_map_strings:
            name = id_map_strings[id]
            logger.debug(
                '\n    Key: "%s"\n        Value: "%s"'
                % (name, content.decode(codec, errors="replace"))
            )
        elif id in id_map_values:
            name = id_map_values[id]
            if size == 9:
                (value,) = struct.unpack(b"B", content)
                logger.debug('\n    Key: "%s"\n        Value: 0x%01x' % (name, value))
            elif size == 10:
                (value,) = struct.unpack(b">H", content)
                logger.debug('\n    Key: "%s"\n        Value: 0x%02x' % (name, value))
            elif size == 12:
                (value,) = struct.unpack(b">L", content)
                logger.debug('\n    Key: "%s"\n        Value: 0x%04x' % (name, value))
            else:
                logger.debug(
                    "\nError: Value for %s has unexpected size of %s" % (name, size)
                )
        elif id in id_map_hexstrings:
            name = id_map_hexstrings[id]
            logger.debug(
                '\n    Key: "%s"\n        Value: 0x%s' % (name, hexlify(content))
            )
        else:
            logger.debug("\nWarning: Unknown metadata with id %s found" % id)
            name = str(id) + " (hex)"
            logger.debug(
                '    Key: "%s"\n        Value: 0x%s' % (name, hexlify(content))
            )
        pos += size
    return


class MobiHeader:
    # all values are packed in big endian format
    palmdoc_header = {
        "compression_type": (0x00, b">H", 2),
        "fill0": (0x02, b">H", 2),
        "text_length": (0x04, b">L", 4),
        "text_records": (0x08, b">H", 2),
        "max_section_size": (0x0A, b">H", 2),
        "read_pos   ": (0x0C, b">L", 4),
    }

    mobi6_header = {
        "compression_type": (0x00, b">H", 2),
        "fill0": (0x02, b">H", 2),
        "text_length": (0x04, b">L", 4),
        "text_records": (0x08, b">H", 2),
        "max_section_size": (0x0A, b">H", 2),
        "crypto_type": (0x0C, b">H", 2),
        "fill1": (0x0E, b">H", 2),
        "magic": (0x10, b"4s", 4),
        "header_length (from MOBI)": (0x14, b">L", 4),
        "type": (0x18, b">L", 4),
        "codepage": (0x1C, b">L", 4),
        "unique_id": (0x20, b">L", 4),
        "version": (0x24, b">L", 4),
        "metaorthindex": (0x28, b">L", 4),
        "metainflindex": (0x2C, b">L", 4),
        "index_names": (0x30, b">L", 4),
        "index_keys": (0x34, b">L", 4),
        "extra_index0": (0x38, b">L", 4),
        "extra_index1": (0x3C, b">L", 4),
        "extra_index2": (0x40, b">L", 4),
        "extra_index3": (0x44, b">L", 4),
        "extra_index4": (0x48, b">L", 4),
        "extra_index5": (0x4C, b">L", 4),
        "first_nontext": (0x50, b">L", 4),
        "title_offset": (0x54, b">L", 4),
        "title_length": (0x58, b">L", 4),
        "language_code": (0x5C, b">L", 4),
        "dict_in_lang": (0x60, b">L", 4),
        "dict_out_lang": (0x64, b">L", 4),
        "min_version": (0x68, b">L", 4),
        "first_resc_offset": (0x6C, b">L", 4),
        "huff_offset": (0x70, b">L", 4),
        "huff_num": (0x74, b">L", 4),
        "huff_tbl_offset": (0x78, b">L", 4),
        "huff_tbl_len": (0x7C, b">L", 4),
        "exth_flags": (0x80, b">L", 4),
        "fill3_a": (0x84, b">L", 4),
        "fill3_b": (0x88, b">L", 4),
        "fill3_c": (0x8C, b">L", 4),
        "fill3_d": (0x90, b">L", 4),
        "fill3_e": (0x94, b">L", 4),
        "fill3_f": (0x98, b">L", 4),
        "fill3_g": (0x9C, b">L", 4),
        "fill3_h": (0xA0, b">L", 4),
        "unknown0": (0xA4, b">L", 4),
        "drm_offset": (0xA8, b">L", 4),
        "drm_count": (0xAC, b">L", 4),
        "drm_size": (0xB0, b">L", 4),
        "drm_flags": (0xB4, b">L", 4),
        "fill4_a": (0xB8, b">L", 4),
        "fill4_b": (0xBC, b">L", 4),
        "first_content": (0xC0, b">H", 2),
        "last_content": (0xC2, b">H", 2),
        "unknown0": (0xC4, b">L", 4),
        "fcis_offset": (0xC8, b">L", 4),
        "fcis_count": (0xCC, b">L", 4),
        "flis_offset": (0xD0, b">L", 4),
        "flis_count": (0xD4, b">L", 4),
        "unknown1": (0xD8, b">L", 4),
        "unknown2": (0xDC, b">L", 4),
        "srcs_offset": (0xE0, b">L", 4),
        "srcs_count": (0xE4, b">L", 4),
        "unknown3": (0xE8, b">L", 4),
        "unknown4": (0xEC, b">L", 4),
        "fill5": (0xF0, b">H", 2),
        "traildata_flags": (0xF2, b">H", 2),
        "ncx_index": (0xF4, b">L", 4),
        "unknown5": (0xF8, b">L", 4),
        "unknown6": (0xFC, b">L", 4),
        "datp_offset": (0x100, b">L", 4),
        "unknown7": (0x104, b">L", 4),
        "Unknown    ": (0x108, b">L", 4),
        "Unknown    ": (0x10C, b">L", 4),
        "Unknown    ": (0x110, b">L", 4),
        "Unknown    ": (0x114, b">L", 4),
        "Unknown    ": (0x118, b">L", 4),
        "Unknown    ": (0x11C, b">L", 4),
        "Unknown    ": (0x120, b">L", 4),
        "Unknown    ": (0x124, b">L", 4),
        "Unknown    ": (0x128, b">L", 4),
        "Unknown    ": (0x12C, b">L", 4),
        "Unknown    ": (0x130, b">L", 4),
        "Unknown    ": (0x134, b">L", 4),
        "Unknown    ": (0x138, b">L", 4),
        "Unknown    ": (0x11C, b">L", 4),
    }

    mobi8_header = {
        "compression_type": (0x00, b">H", 2),
        "fill0": (0x02, b">H", 2),
        "text_length": (0x04, b">L", 4),
        "text_records": (0x08, b">H", 2),
        "max_section_size": (0x0A, b">H", 2),
        "crypto_type": (0x0C, b">H", 2),
        "fill1": (0x0E, b">H", 2),
        "magic": (0x10, b"4s", 4),
        "header_length (from MOBI)": (0x14, b">L", 4),
        "type": (0x18, b">L", 4),
        "codepage": (0x1C, b">L", 4),
        "unique_id": (0x20, b">L", 4),
        "version": (0x24, b">L", 4),
        "metaorthindex": (0x28, b">L", 4),
        "metainflindex": (0x2C, b">L", 4),
        "index_names": (0x30, b">L", 4),
        "index_keys": (0x34, b">L", 4),
        "extra_index0": (0x38, b">L", 4),
        "extra_index1": (0x3C, b">L", 4),
        "extra_index2": (0x40, b">L", 4),
        "extra_index3": (0x44, b">L", 4),
        "extra_index4": (0x48, b">L", 4),
        "extra_index5": (0x4C, b">L", 4),
        "first_nontext": (0x50, b">L", 4),
        "title_offset": (0x54, b">L", 4),
        "title_length": (0x58, b">L", 4),
        "language_code": (0x5C, b">L", 4),
        "dict_in_lang": (0x60, b">L", 4),
        "dict_out_lang": (0x64, b">L", 4),
        "min_version": (0x68, b">L", 4),
        "first_resc_offset": (0x6C, b">L", 4),
        "huff_offset": (0x70, b">L", 4),
        "huff_num": (0x74, b">L", 4),
        "huff_tbl_offset": (0x78, b">L", 4),
        "huff_tbl_len": (0x7C, b">L", 4),
        "exth_flags": (0x80, b">L", 4),
        "fill3_a": (0x84, b">L", 4),
        "fill3_b": (0x88, b">L", 4),
        "fill3_c": (0x8C, b">L", 4),
        "fill3_d": (0x90, b">L", 4),
        "fill3_e": (0x94, b">L", 4),
        "fill3_f": (0x98, b">L", 4),
        "fill3_g": (0x9C, b">L", 4),
        "fill3_h": (0xA0, b">L", 4),
        "unknown0": (0xA4, b">L", 4),
        "drm_offset": (0xA8, b">L", 4),
        "drm_count": (0xAC, b">L", 4),
        "drm_size": (0xB0, b">L", 4),
        "drm_flags": (0xB4, b">L", 4),
        "fill4_a": (0xB8, b">L", 4),
        "fill4_b": (0xBC, b">L", 4),
        "fdst_offset": (0xC0, b">L", 4),
        "fdst_flow_count": (0xC4, b">L", 4),
        "fcis_offset": (0xC8, b">L", 4),
        "fcis_count": (0xCC, b">L", 4),
        "flis_offset": (0xD0, b">L", 4),
        "flis_count": (0xD4, b">L", 4),
        "unknown1": (0xD8, b">L", 4),
        "unknown2": (0xDC, b">L", 4),
        "srcs_offset": (0xE0, b">L", 4),
        "srcs_count": (0xE4, b">L", 4),
        "unknown3": (0xE8, b">L", 4),
        "unknown4": (0xEC, b">L", 4),
        "fill5": (0xF0, b">H", 2),
        "traildata_flags": (0xF2, b">H", 2),
        "ncx_index": (0xF4, b">L", 4),
        "fragment_index": (0xF8, b">L", 4),
        "skeleton_index": (0xFC, b">L", 4),
        "datp_offset": (0x100, b">L", 4),
        "guide_index": (0x104, b">L", 4),
        "Unknown    ": (0x108, b">L", 4),
        "Unknown    ": (0x10C, b">L", 4),
        "Unknown    ": (0x110, b">L", 4),
        "Unknown    ": (0x114, b">L", 4),
        "Unknown    ": (0x118, b">L", 4),
        "Unknown    ": (0x11C, b">L", 4),
        "Unknown    ": (0x120, b">L", 4),
        "Unknown    ": (0x124, b">L", 4),
        "Unknown    ": (0x128, b">L", 4),
        "Unknown    ": (0x12C, b">L", 4),
        "Unknown    ": (0x130, b">L", 4),
        "Unknown    ": (0x134, b">L", 4),
        "Unknown    ": (0x138, b">L", 4),
        "Unknown    ": (0x11C, b">L", 4),
    }

    palmdoc_header_sorted_keys = sortedHeaderKeys(palmdoc_header)
    mobi6_header_sorted_keys = sortedHeaderKeys(mobi6_header)
    mobi8_header_sorted_keys = sortedHeaderKeys(mobi8_header)

    id_map_strings = {
        1: "Drm Server Id",
        2: "Drm Commerce Id",
        3: "Drm Ebookbase Book Id",
        4: "Drm Ebookbase Dep Id",
        100: "Creator",
        101: "Publisher",
        102: "Imprint",
        103: "Description",
        104: "ISBN",
        105: "Subject",
        106: "Published",
        107: "Review",
        108: "Contributor",
        109: "Rights",
        110: "SubjectCode",
        111: "Type",
        112: "Source",
        113: "ASIN",
        114: "versionNumber",
        117: "Adult",
        118: "Retail-Price",
        119: "Retail-Currency",
        120: "TSC",
        122: "fixed-layout",
        123: "book-type",
        124: "orientation-lock",
        126: "original-resolution",
        127: "zero-gutter",
        128: "zero-margin",
        129: "MetadataResourceURI",
        132: "RegionMagnification",
        150: "LendingEnabled",
        200: "DictShortName",
        501: "cdeType",
        502: "last_update_time",
        503: "Updated_Title",
        504: "CDEContentKey",
        505: "AmazonContentReference",
        506: "Title-Language",
        507: "Title-Display-Direction",
        508: "Title-Pronunciation",
        509: "Title-Collation",
        510: "Secondary-Title",
        511: "Secondary-Title-Language",
        512: "Secondary-Title-Direction",
        513: "Secondary-Title-Pronunciation",
        514: "Secondary-Title-Collation",
        515: "Author-Language",
        516: "Author-Display-Direction",
        517: "Author-Pronunciation",
        518: "Author-Collation",
        519: "Author-Type",
        520: "Publisher-Language",
        521: "Publisher-Display-Direction",
        522: "Publisher-Pronunciation",
        523: "Publisher-Collation",
        524: "Content-Language-Tag",
        525: "primary-writing-mode",
        526: "NCX-Ingested-By-Software",
        527: "page-progression-direction",
        528: "override-kindle-fonts",
        529: "Compression-Upgraded",
        530: "Soft-Hyphens-In-Content",
        531: "Dictionary_In_Langague",
        532: "Dictionary_Out_Language",
        533: "Font_Converted",
        534: "Amazon_Creator_Info",
        535: "Creator-Build-Tag",
        536: "HD-Media-Containers-Info",  # CONT_Header is 0, Ends with CONTAINER_BOUNDARY (or Asset_Type?)
        538: "Resource-Container-Fidelity",
        539: "HD-Container-Mimetype",
        540: "Sample-For_Special-Purpose",
        541: "Kindletool-Operation-Information",
        542: "Container_Id",
        543: "Asset-Type",  # FONT_CONTAINER, BW_CONTAINER, HD_CONTAINER
        544: "Unknown_544",
    }
    id_map_values = {
        115: "sample",
        116: "StartOffset",
        121: "Mobi8-Boundary-Section",
        125: "Embedded-Record-Count",
        130: "Offline-Sample",
        131: "Metadata-Record-Offset",
        201: "CoverOffset",
        202: "ThumbOffset",
        203: "HasFakeCover",
        204: "Creator-Software",
        205: "Creator-Major-Version",
        206: "Creator-Minor-Version",
        207: "Creator-Build-Number",
        401: "Clipping-Limit",
        402: "Publisher-Limit",
        404: "Text-to-Speech-Disabled",
        406: "Rental-Expiration-Time",
    }
    id_map_hexstrings = {
        208: "Watermark_(hex)",
        209: "Tamper-Proof-Keys_(hex)",
        300: "Font-Signature_(hex)",
        403: "Unknown_(403)_(hex)",
        405: "Ownership-Type_(hex)",
        407: "Unknown_(407)_(hex)",
        420: "Multimedia-Content-Reference_(hex)",
        450: "Locations_Match_(hex)",
        451: "Full-Story-Length_(hex)",
        452: "Sample-Start_Location_(hex)",
        453: "Sample-End-Location_(hex)",
    }

    def __init__(self, sect, sectNumber):
        self.sect = sect
        self.start = sectNumber
        self.header = self.sect.loadSection(self.start)
        if len(self.header) > 20 and self.header[16:20] == b"MOBI":
            self.sect.setsectiondescription(0, "Mobipocket Header")
            self.palm = False
        elif self.sect.ident == b"TEXtREAd":
            self.sect.setsectiondescription(0, "PalmDOC Header")
            self.palm = True
        else:
            raise unpackException("Unknown File Format")

        (self.records,) = struct.unpack_from(b">H", self.header, 0x8)

        # set defaults in case this is a PalmDOC
        self.title = self.sect.palmname.decode("latin-1", errors="replace")
        self.length = len(self.header) - 16
        self.type = 3
        self.codepage = 1252
        self.codec = "windows-1252"
        self.unique_id = 0
        self.version = 0
        self.hasExth = False
        self.exth = b""
        self.exth_offset = self.length + 16
        self.exth_length = 0
        self.crypto_type = 0
        self.firstnontext = self.start + self.records + 1
        self.firstresource = self.start + self.records + 1
        self.ncxidx = 0xFFFFFFFF
        self.metaOrthIndex = 0xFFFFFFFF
        self.metaInflIndex = 0xFFFFFFFF
        self.skelidx = 0xFFFFFFFF
        self.fragidx = 0xFFFFFFFF
        self.guideidx = 0xFFFFFFFF
        self.fdst = 0xFFFFFFFF
        self.mlstart = self.sect.loadSection(self.start + 1)[:4]
        self.rawSize = 0
        self.metadata = dict_()

        # set up for decompression/unpacking
        (self.compression,) = struct.unpack_from(b">H", self.header, 0x0)
        if self.compression == 0x4448:
            reader = HuffcdicReader()
            huffoff, huffnum = struct.unpack_from(b">LL", self.header, 0x70)
            huffoff = huffoff + self.start
            self.sect.setsectiondescription(huffoff, "Huffman Compression Seed")
            reader.loadHuff(self.sect.loadSection(huffoff))
            for i in range(1, huffnum):
                self.sect.setsectiondescription(
                    huffoff + i, "Huffman CDIC Compression Seed %d" % i
                )
                reader.loadCdic(self.sect.loadSection(huffoff + i))
            self.unpack = reader.unpack
        elif self.compression == 2:
            self.unpack = PalmdocReader().unpack
        elif self.compression == 1:
            self.unpack = UncompressedReader().unpack
        else:
            raise unpackException("invalid compression type: 0x%4x" % self.compression)

        if self.palm:
            return

        (
            self.length,
            self.type,
            self.codepage,
            self.unique_id,
            self.version,
        ) = struct.unpack(b">LLLLL", self.header[20:40])
        codec_map = {
            1252: "windows-1252",
            65001: "utf-8",
        }
        if self.codepage in codec_map:
            self.codec = codec_map[self.codepage]

        # title
        toff, tlen = struct.unpack(b">II", self.header[0x54:0x5C])
        tend = toff + tlen
        self.title = self.header[toff:tend].decode(self.codec, errors="replace")

        (exth_flag,) = struct.unpack(b">L", self.header[0x80:0x84])
        self.hasExth = exth_flag & 0x40
        self.exth_offset = self.length + 16
        self.exth_length = 0
        if self.hasExth:
            (self.exth_length,) = struct.unpack_from(
                b">L", self.header, self.exth_offset + 4
            )
            self.exth_length = (
                (self.exth_length + 3) >> 2
            ) << 2  # round to next 4 byte boundary
            self.exth = self.header[
                self.exth_offset : self.exth_offset + self.exth_length
            ]

        # parse the exth / metadata
        self.parseMetaData()

        # self.mlstart = self.sect.loadSection(self.start+1)
        # self.mlstart = self.mlstart[0:4]
        (self.crypto_type,) = struct.unpack_from(b">H", self.header, 0xC)

        # Start sector for additional files such as images, fonts, resources, etc
        # Can be missing so fall back to default set previously
        (ofst,) = struct.unpack_from(b">L", self.header, 0x6C)
        if ofst != 0xFFFFFFFF:
            self.firstresource = ofst + self.start
        (ofst,) = struct.unpack_from(b">L", self.header, 0x50)
        if ofst != 0xFFFFFFFF:
            self.firstnontext = ofst + self.start

        if self.isPrintReplica():
            return

        if self.version < 8:
            # Dictionary metaOrthIndex
            (self.metaOrthIndex,) = struct.unpack_from(b">L", self.header, 0x28)
            if self.metaOrthIndex != 0xFFFFFFFF:
                self.metaOrthIndex += self.start

            # Dictionary metaInflIndex
            (self.metaInflIndex,) = struct.unpack_from(b">L", self.header, 0x2C)
            if self.metaInflIndex != 0xFFFFFFFF:
                self.metaInflIndex += self.start

        # handle older headers without any ncxindex info and later
        # specifically 0xe4 headers
        if self.length + 16 < 0xF8:
            return

        # NCX Index
        (self.ncxidx,) = struct.unpack(b">L", self.header[0xF4:0xF8])
        if self.ncxidx != 0xFFFFFFFF:
            self.ncxidx += self.start

        # K8 specific Indexes
        if self.start != 0 or self.version == 8:
            # Index into <xml> file skeletons in RawML
            (self.skelidx,) = struct.unpack_from(b">L", self.header, 0xFC)
            if self.skelidx != 0xFFFFFFFF:
                self.skelidx += self.start

            # Index into <div> sections in RawML
            (self.fragidx,) = struct.unpack_from(b">L", self.header, 0xF8)
            if self.fragidx != 0xFFFFFFFF:
                self.fragidx += self.start

            # Index into Other files
            (self.guideidx,) = struct.unpack_from(b">L", self.header, 0x104)
            if self.guideidx != 0xFFFFFFFF:
                self.guideidx += self.start

            # dictionaries do not seem to use the same approach in K8's
            # so disable them
            self.metaOrthIndex = 0xFFFFFFFF
            self.metaInflIndex = 0xFFFFFFFF

            # need to use the FDST record to find out how to properly unpack
            # the rawML into pieces
            # it is simply a table of start and end locations for each flow piece
            (self.fdst,) = struct.unpack_from(b">L", self.header, 0xC0)
            (self.fdstcnt,) = struct.unpack_from(b">L", self.header, 0xC4)

            # if cnt is 1 or less, fdst section mumber can be garbage
            if self.fdstcnt <= 1:
                self.fdst = 0xFFFFFFFF
            if self.fdst != 0xFFFFFFFF:
                self.fdst += self.start
                # setting of fdst section description properly handled in mobi_kf8proc


    def dump_exth(self):
        # determine text encoding
        codec = self.codec
        if (not self.hasExth) or (self.exth_length) == 0 or (self.exth == b""):
            return
        (num_items,) = struct.unpack(b">L", self.exth[8:12])
        pos = 12
        logger.debug("Key Size Decription                     Value")
        for _ in range(num_items):
            id, size = struct.unpack(b">LL", self.exth[pos : pos + 8])
            contentsize = size - 8
            content = self.exth[pos + 8 : pos + size]
            if id in MobiHeader.id_map_strings:
                exth_name = MobiHeader.id_map_strings[id]
                logger.debug(
                    "{0: >3d} {1: >4d} {2: <30s} {3:s}".format(
                        id,
                        contentsize,
                        exth_name,
                        content.decode(codec, errors="replace"),
                    )
                )
            elif id in MobiHeader.id_map_values:
                exth_name = MobiHeader.id_map_values[id]
                if size == 9:
                    (value,) = struct.unpack(b"B", content)
                    logger.debug(
                        "{0:3d} byte {1:<30s} {2:d}".format(id, exth_name, value)
                    )
                elif size == 10:
                    (value,) = struct.unpack(b">H", content)
                    logger.debug(
                        "{0:3d} word {1:<30s} 0x{2:0>4X} ({2:d})".format(
                            id, exth_name, value
                        )
                    )
                elif size == 12:
                    (value,) = struct.unpack(b">L", content)
                    logger.debug(
                        "{0:3d} long {1:<30s} 0x{2:0>8X} ({2:d})".format(
                            id, exth_name, value
                        )
                    )
                else:
                    logger.debug(
                        "{0: >3d} {1: >4d} {2: <30s} (0x{3:s})".format(
                            id,
                            contentsize,
                            "Bad size for " + exth_name,
                            hexlify(content),
                        )
                    )
            elif id in MobiHeader.id_map_hexstrings:
                exth_name = MobiHeader.id_map_hexstrings[id]
                logger.debug(
                    "{0:3d} {1:4d} {2:<30s} 0x{3:s}".format(
                        id, contentsize, exth_name, hexlify(content)
                    )
                )
            else:
                exth_name = "Unknown EXTH ID {0:d}".format(id)
                logger.debug(
                    "{0: >3d} {1: >4d} {2: <30s} 0x{3:s}".format(
                        id, contentsize, exth_name, hexlify(content)
                    )
                )
            pos += size
        return

    def dumpheader(self):
        # first 16 bytes are not part of the official mobiheader
        # but we will treat it as such
        # so section 0 is 16 (decimal) + self.length in total == at least 0x108 bytes for Mobi 8 headers
        logger.debug(
            "Dumping section %d, Mobipocket Header version: %d, total length %d"
            % (self.start, self.version, self.length + 16)
        )
        self.hdr = {}
        # set it up for the proper header version
        if self.version == 0:
            self.mobi_header = MobiHeader.palmdoc_header
            self.mobi_header_sorted_keys = MobiHeader.palmdoc_header_sorted_keys
        elif self.version < 8:
            self.mobi_header = MobiHeader.mobi6_header
            self.mobi_header_sorted_keys = MobiHeader.mobi6_header_sorted_keys
        else:
            self.mobi_header = MobiHeader.mobi8_header
            self.mobi_header_sorted_keys = MobiHeader.mobi8_header_sorted_keys

        # parse the header information
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                (val,) = struct.unpack_from(format, self.header, pos)
                self.hdr[key] = val

        if "title_offset" in self.hdr:
            title_offset = self.hdr["title_offset"]
            title_length = self.hdr["title_length"]
        else:
            title_offset = 0
            title_length = 0
        if title_offset == 0:
            title_offset = len(self.header)
            title_length = 0
            self.title = self.sect.palmname.decode("latin-1", errors="replace")
        else:
            self.title = self.header[title_offset : title_offset + title_length].decode(
                self.codec, errors="replace"
            )
            # title record always padded with two nul bytes and then padded with nuls to next 4 byte boundary
            title_length = ((title_length + 2 + 3) >> 2) << 2

        self.extra1 = self.header[self.exth_offset + self.exth_length : title_offset]
        self.extra2 = self.header[title_offset + title_length :]

        logger.debug("Mobipocket header from section %d" % self.start)
        logger.debug("     Offset  Value Hex Dec        Description")
        for key in self.mobi_header_sorted_keys:
            (pos, format, tot_len) = self.mobi_header[key]
            if pos < (self.length + 16):
                if key != "magic":
                    fmt_string = (
                        "0x{0:0>3X} ({0:3d}){1: >"
                        + str(9 - 2 * tot_len)
                        + "s}0x{2:0>"
                        + str(2 * tot_len)
                        + "X} {2:10d} {3:s}"
                    )
                else:
                    self.hdr[key] = unicode_str(self.hdr[key])
                    fmt_string = "0x{0:0>3X} ({0:3d}){2:>11s}            {3:s}"
                logger.debug(fmt_string.format(pos, " ", self.hdr[key], key))
        logger.debug("")

        if self.exth_length > 0:
            logger.debug(
                "EXTH metadata, offset %d, padded length %d"
                % (self.exth_offset, self.exth_length)
            )
            self.dump_exth()
            logger.debug("")

        if len(self.extra1) > 0:
            logger.debug(
                "Extra data between EXTH and Title, length %d" % len(self.extra1)
            )
            logger.debug(hexlify(self.extra1))
            logger.debug("")

        if title_length > 0:
            logger.debug(
                "Title in header at offset %d, padded length %d: '%s'"
                % (title_offset, title_length, self.title)
            )
            logger.debug("")

        if len(self.extra2) > 0:
            print(
                "Extra data between Title and end of header, length %d"
                % len(self.extra2)
            )
            #print(hexlify(self.extra2))
            print("")

    def isPrintReplica(self):
        return self.mlstart[0:4] == b"%MOP"

    def isK8(self):
        return self.start != 0 or self.version == 8

    def isEncrypted(self):
        return self.crypto_type != 0

    def hasNCX(self):
        return self.ncxidx != 0xFFFFFFFF

    def isDictionary(self):
        return self.metaOrthIndex != 0xFFFFFFFF

    def getncxIndex(self):
        return self.ncxidx

    def decompress(self, data):
        return self.unpack(data)

    def Language(self):
        langcode = struct.unpack(b"!L", self.header[0x5C:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 8) & 0xFF
        return getLanguage(langid, sublangid)

    def DictInLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b"!L", self.header[0x60:0x64])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def DictOutLanguage(self):
        if self.isDictionary():
            langcode = struct.unpack(b"!L", self.header[0x64:0x68])[0]
            langid = langcode & 0xFF
            sublangid = (langcode >> 10) & 0xFF
            if langid != 0:
                return getLanguage(langid, sublangid)
        return False

    def getRawML(self):
        def getSizeOfTrailingDataEntry(data):
            num = 0
            for v in data[-4:]:
                if bord(v) & 0x80:
                    num = 0
                num = (num << 7) | (bord(v) & 0x7F)
            return num

        def trimTrailingDataEntries(data):
            for _ in range(trailers):
                num = getSizeOfTrailingDataEntry(data)
                data = data[:-num]
            if multibyte:
                num = (ord(data[-1:]) & 3) + 1
                data = data[:-num]
            return data

        multibyte = 0
        trailers = 0
        if self.sect.ident == b"BOOKMOBI":
            (mobi_length,) = struct.unpack_from(b">L", self.header, 0x14)
            (mobi_version,) = struct.unpack_from(b">L", self.header, 0x68)
            if (mobi_length >= 0xE4) and (mobi_version >= 5):
                (flags,) = struct.unpack_from(b">H", self.header, 0xF2)
                multibyte = flags & 1
                while flags > 1:
                    if flags & 2:
                        trailers += 1
                    flags = flags >> 1
        # get raw mobi markup languge
        logger.debug("Unpacking raw markup language")
        dataList = []
        # offset = 0
        for i in range(1, self.records + 1):
            data = trimTrailingDataEntries(self.sect.loadSection(self.start + i))
            dataList.append(self.unpack(data))
            if self.isK8():
                self.sect.setsectiondescription(
                    self.start + i, "KF8 Text Section {0:d}".format(i)
                )
            elif self.version == 0:
                self.sect.setsectiondescription(
                    self.start + i, "PalmDOC Text Section {0:d}".format(i)
                )
            else:
                self.sect.setsectiondescription(
                    self.start + i, "Mobipocket Text Section {0:d}".format(i)
                )
        rawML = b"".join(dataList)
        self.rawSize = len(rawML)
        return rawML

    # all metadata is stored in a dictionary with key and returns a *list* of values
    # a list is used to allow for multiple creators, multiple contributors, etc
    def parseMetaData(self):
        def addValue(name, value):
            if name not in self.metadata:
                self.metadata[name] = [value]
            else:
                self.metadata[name].append(value)

        codec = self.codec
        if self.hasExth:
            extheader = self.exth
            _length, num_items = struct.unpack(b">LL", extheader[4:12])
            extheader = extheader[12:]
            pos = 0
            for _ in range(num_items):
                id, size = struct.unpack(b">LL", extheader[pos : pos + 8])
                content = extheader[pos + 8 : pos + size]
                if id in MobiHeader.id_map_strings:
                    name = MobiHeader.id_map_strings[id]
                    addValue(name, content.decode(codec, errors="replace"))
                elif id in MobiHeader.id_map_values:
                    name = MobiHeader.id_map_values[id]
                    if size == 9:
                        (value,) = struct.unpack(b"B", content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 10:
                        (value,) = struct.unpack(b">H", content)
                        addValue(name, unicode_str(str(value)))
                    elif size == 12:
                        (value,) = struct.unpack(b">L", content)
                        # handle special case of missing CoverOffset or missing ThumbOffset
                        if id == 201 or id == 202:
                            if value != 0xFFFFFFFF:
                                addValue(name, unicode_str(str(value)))
                        else:
                            addValue(name, unicode_str(str(value)))
                    else:
                        print(
                            "Warning: Bad key, size, value combination detected in EXTH ",
                            id,
                            size,
                            hexlify(content),
                        )
                        addValue(name, hexlify(content))
                elif id in MobiHeader.id_map_hexstrings:
                    name = MobiHeader.id_map_hexstrings[id]
                    addValue(name, hexlify(content))
                else:
                    name = unicode_str(str(id)) + " (hex)"
                    addValue(name, hexlify(content))
                pos += size

        # add the basics to the metadata each as a list element
        self.metadata["Language"] = [self.Language()]
        self.metadata["Title"] = [unicode_str(self.title, self.codec)]
        self.metadata["Codec"] = [self.codec]
        self.metadata["UniqueID"] = [unicode_str(str(self.unique_id))]
        # if no asin create one using a uuid
        if "ASIN" not in self.metadata:
            self.metadata["ASIN"] = [unicode_str(str(uuid.uuid4()))]
        # if no cdeType set it to "EBOK"
        if "cdeType" not in self.metadata:
            self.metadata["cdeType"] = ["EBOK"]

    def getMetaData(self):
        return self.metadata

    def describeHeader(self, DUMP):
        logger.debug("Mobi Version: %s" % self.version)
        logger.debug("Codec: %s" % self.codec)
        logger.debug("Title: %s" % self.title)

        logger.debug("dumpheader : %s" % self.dumpheader())

        if "Updated_Title" in self.metadata:
            logger.debug("EXTH Title: %s" % self.metadata["Updated_Title"][0])
        if self.compression == 0x4448:
            logger.debug("Huffdic compression")
        elif self.compression == 2:
            logger.debug("Palmdoc compression")
        elif self.compression == 1:
            logger.debug("No compression")
        if DUMP:
            self.dumpheader()