kman/mobiparse/mobi/mobi_index.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, bchr, bstr, bord
from loguru import logger

# When the compatibility layer reports Python 2, rebind range to xrange so
# every range(...) loop in this module iterates lazily instead of building
# a full list first.
if PY2:
    range = xrange

import struct

# note:  struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_utils import toHex


class MobiIndex:
    """
    Reader for INDX index sections.

    The sections are fetched through the ``sect`` object handed to the
    constructor, which must provide ``loadSection(n)`` (returning the raw
    bytes of section n) and ``setsectiondescription(n, text)``.
    """

    # CGDBG
    def __init__(self, sect, DEBUG=True):
        """
        @param sect: Section accessor (see the class docstring).
        @param DEBUG: When true, parsing details are written to the debug log.
        """
        self.sect = sect
        self.DEBUG = DEBUG

    def getIndexData(self, idx, label="Unknown"):
        """
        Load and decode the index whose main INDX section is number idx.

        Section layout as read by this method: the main INDX section at idx,
        followed by ``count`` extra INDX sections (idx + 1 ...), followed by
        ``nctoc`` CTOC sections.

        @param idx: Section number of the main INDX section; 0xFFFFFFFF means
            "no index" and yields empty results.
        @param label: Name used in the section descriptions that are recorded.
        @return: Tuple (outtbl, ctoc_text). outtbl is a list of [text, tagMap]
            pairs, one per index entry; ctoc_text maps a CTOC offset (plus
            0x10000 for every preceding CTOC section) to the raw name bytes.

        NOTE(review): parseINDXHeader returns False for a section that does
        not start with b"INDX"; the tuple unpacking below then fails with a
        TypeError.
        """
        sect = self.sect
        outtbl = []
        ctoc_text = {}
        if idx == 0xFFFFFFFF:
            return outtbl, ctoc_text

        sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
        data = sect.loadSection(idx)
        idxhdr, _hordt1, hordt2 = self.parseINDXHeader(data)
        IndexCount = idxhdr["count"]

        # handle the case of multiple sections used for CTOC
        rec_off = 0
        off = idx + IndexCount + 1
        for j in range(idxhdr["nctoc"]):
            cdata = sect.loadSection(off + j)
            sect.setsectiondescription(off + j, label + " CTOC Data " + str(j))
            for k, name in self.readCTOC(cdata).items():
                ctoc_text[k + rec_off] = name
            # offsets of the next CTOC section live in the next 64K window
            rec_off += 0x10000

        # the TAGX table is looked for at offset header["len"] of the main section
        tagSectionStart = idxhdr["len"]
        controlByteCount, tagTable = readTagSection(tagSectionStart, data)
        if self.DEBUG:
            # loguru formats with str.format, so every value needs a {} slot
            logger.debug("ControlByteCount is {}", controlByteCount)
            logger.debug("IndexCount is {}", IndexCount)
            logger.debug("TagTable: {}", tagTable)

        for i in range(idx + 1, idx + 1 + IndexCount):
            sect.setsectiondescription(
                i, "{0} Extra {1:d} INDX section".format(label, i - idx)
            )
            data = sect.loadSection(i)
            hdrinfo, _ordt1, _ordt2 = self.parseINDXHeader(data)
            idxtPos = hdrinfo["start"]
            entryCount = hdrinfo["count"]
            if self.DEBUG:
                logger.debug("{} {}", idxtPos, entryCount)

            # loop through to build up the IDXT position starts
            # (big-endian 16 bit offsets stored right after the 4 byte tag)
            idxPositions = [
                struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))[0]
                for j in range(entryCount)
            ]
            # The last entry ends before the IDXT tag (but there might be
            # zero fill bytes we need to ignore!)
            idxPositions.append(idxtPos)

            # for each entry in the IDXT build up the tagMap and any associated text
            for j in range(entryCount):
                startPos = idxPositions[j]
                endPos = idxPositions[j + 1]
                # each entry starts with a one byte text length, then the text
                textLength = ord(data[startPos : startPos + 1])
                text = data[startPos + 1 : startPos + 1 + textLength]
                if hordt2 is not None:
                    # translate every text byte through the main header's ORDT2 table
                    text = b"".join(bchr(hordt2[bord(x)]) for x in text)
                tagMap = getTagMap(
                    controlByteCount,
                    tagTable,
                    data,
                    startPos + 1 + textLength,
                    endPos,
                )
                outtbl.append([text, tagMap])
                if self.DEBUG:
                    # CGDBG
                    logger.debug("tagMap {}", tagMap)
                    logger.debug("text {}", text)
                    logger.debug("data {}", data)

        return outtbl, ctoc_text

    def parseINDXHeader(self, data):
        """
        Read the INDX header of a section.

        @param data: Raw bytes of the section.
        @return: Tuple (header, ordt1, ordt2). header maps each field name in
            ``words`` to its big-endian 32 bit value; ordt1/ordt2 are tuples of
            table entries, or None when no ORDT tables are detected.
            Returns False (not a tuple) when data does not start with b"INDX".
        """
        if data[:4] != b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        # the fields are consecutive unsigned 32 bit values right after the magic
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = dict(zip(words, values))

        ordt1 = None
        ordt2 = None

        ocnt, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
        if header["code"] == 0xFDEA or ocnt != 0 or oentries > 0:
            # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
            # them in the proper place in the header.  They seem to be codepage 65002 which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings

            # so we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            # NOTE(review): these checks are asserts, so they vanish under "python -O"
            assert ocnt == 1
            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if self.DEBUG:
            logger.debug("parsed INDX header:")
            for n in words:
                # goes to the log like the surrounding lines (was a bare print)
                logger.debug("{} {:X}", n, header[n])
            logger.debug("")
        return header, ordt1, ordt2

    def readCTOC(self, txtdata):
        """
        Read all name blocks from one CTOC section.

        @param txtdata: Raw bytes of the CTOC section.
        @return: Dict mapping the offset at which a block starts to the raw
            bytes of its name.
        """
        ctoc_data = {}
        offset = 0
        while offset < len(txtdata):
            # a block starting with a zero byte ends the scan; slicing gives a
            # bytes object on both Python 2 and 3, so no version switch is needed
            if txtdata[offset : offset + 1] == b"\0":
                break
            idx_offs = offset
            # first n bytes: name len as vwi
            pos, ilen = getVariableWidthValue(txtdata, offset)
            offset += pos
            # <len> next bytes: name
            name = txtdata[offset : offset + ilen]
            offset += ilen
            if self.DEBUG:
                logger.debug("name length is {}", ilen)
                logger.debug("{} {}", idx_offs, name)
            ctoc_data[idx_offs] = name
        return ctoc_data


def getVariableWidthValue(data, offset):
    """
    Decode one forward-encoded variable width integer.

    Every byte contributes its low 7 bits, most significant group first;
    the byte that has its high bit (0x80) set is the last one of the value.

    @param data: The bytes to decode.
    @param offset: The start offset into data.
    @return: Tuple of consumed bytes count and decoded value.
    """
    result = 0
    cursor = offset
    while True:
        # slice (not index) so a one-byte bytes object reaches ord() on py2 and py3
        current = ord(data[cursor : cursor + 1])
        cursor += 1
        result = (result << 7) | (current & 0x7F)
        if current & 0x80:
            # stop bit seen: this was the final byte
            return cursor - offset, result


def readTagSection(start, data):
    """
    Parse the TAGX table located at a given position.

    @param start: The start position in the data.
    @param data: The data to process.
    @return: Tuple of control byte count and list of tag tuples (four
        integers per tag); (0, []) when no b"TAGX" marker is found at start.
    """
    if data[start : start + 4] != b"TAGX":
        return 0, []

    # two big-endian 32 bit fields follow the 4 byte marker
    (firstEntryOffset,) = struct.unpack_from(b">L", data, start + 0x04)
    (controlByteCount,) = struct.unpack_from(b">L", data, start + 0x08)

    # Tag records are 4 bytes each and begin after the 12 header bytes.
    tags = [
        tuple(ord(data[p : p + 1]) for p in range(entry, entry + 4))
        for entry in range(start + 12, start + firstEntryOffset, 4)
    ]
    return controlByteCount, tags


def countSetBits(value, bits=8):
    """
    Return how many of the lowest ``bits`` bits of value are 1.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits.
    """
    # test each bit position in turn; positions at or above ``bits`` are ignored
    return sum((value >> shift) & 0x01 for shift in range(bits))


def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    """
    Decode the tag values of a single index entry.

    The entry consists of controlByteCount control bytes at startPos followed
    by variable width values. Decoding runs in two passes: the control bytes
    decide, per tag, how many values (or how many value bytes) are present;
    the values themselves are then read in the same tag order.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    """
    # pass 1: (tag, value count or None, value byte length or None, values per entry)
    pending = []
    controlByteIndex = 0
    dataStart = startPos + controlByteCount

    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # end-of-control-byte marker: following tags use the next control byte
            controlByteIndex += 1
            continue
        cpos = startPos + controlByteIndex
        masked = ord(entryData[cpos : cpos + 1]) & mask
        if masked == 0:
            # tag not present in this entry
            continue
        if masked != mask:
            # Shift bits to get the masked value.
            while (mask & 0x01) == 0:
                mask = mask >> 1
                masked = masked >> 1
            pending.append((tag, masked, None, valuesPerEntry))
        elif countSetBits(mask) > 1:
            # If all bits of masked value are set and the mask has more than one bit, a variable width value
            # will follow after the control bytes which defines the length of bytes (NOT the value count!)
            # which will contain the corresponding variable width values.
            consumed, byteLength = getVariableWidthValue(entryData, dataStart)
            dataStart += consumed
            pending.append((tag, None, byteLength, valuesPerEntry))
        else:
            # single-bit mask that is set: exactly one value group
            pending.append((tag, 1, None, valuesPerEntry))

    # pass 2: read the values for every tag found above
    tagHashMap = {}
    for tag, valueCount, valueBytes, valuesPerEntry in pending:
        values = []
        if valueCount is None:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, item = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(item)
            if totalConsumed != valueBytes:
                logger.debug(
                    "Error: Should consume %s bytes, but consumed %s"
                    % (valueBytes, totalConsumed)
                )
        else:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, item = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(item)
        tagHashMap[tag] = values

    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        leftover = entryData[dataStart:endPos]
        if any(bord(char) != 0 for char in leftover):
            logger.debug(
                "Warning: There are unprocessed index bytes left: %s"
                % toHex(leftover)
            )

    return tagHashMap