kindle manager

2021-08-25 17:58:31 +08:00
parent 5f0c0a9724
commit 6b3c0f3b6b
303 changed files with 87829 additions and 42537 deletions
--- a/mobimaster/mobi/mobi_index.py
+++ b/mobimaster/mobi/mobi_index.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bchr, bstr, bord
+from loguru import logger
+
+if PY2:
+    range = xrange
+
+import struct
+
+# note:  struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+from .mobi_utils import toHex
+
+
+class MobiIndex:
+    """
+    Reader for the INDX (index) sections of a MOBI/KF8 container.
+
+    ``sect`` is the section accessor; this class only calls its
+    ``loadSection(n)`` and ``setsectiondescription(n, text)`` methods.
+    """
+
+    # CGDBG
+    # NOTE(review): DEBUG defaults to True, so every parse is logged verbosely
+    # at debug level -- looks like a leftover debugging default; confirm.
+    def __init__(self, sect, DEBUG=True):
+        self.sect = sect
+        self.DEBUG = DEBUG
+
+    def getIndexData(self, idx, label="Unknown"):
+        """
+        Read a complete index: main INDX section, data sections and CTOC.
+
+        @param idx: Section number of the main INDX section, or 0xFFFFFFFF
+            when there is no index.
+        @param label: Prefix used for the section descriptions set on sect.
+        @return: Tuple (outtbl, ctoc_text).  outtbl is a list of
+            [text, tagMap] pairs, one per index entry.  ctoc_text maps an
+            offset to its CTOC name; offsets from the j-th CTOC section are
+            shifted by j * 0x10000.  Both are empty when idx is 0xFFFFFFFF or
+            the main section does not start with b"INDX".
+        """
+        sect = self.sect
+        outtbl = []
+        ctoc_text = {}
+        if idx == 0xFFFFFFFF:
+            return outtbl, ctoc_text
+
+        sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
+        data = sect.loadSection(idx)
+        parsed = self.parseINDXHeader(data)
+        if not parsed:
+            # parseINDXHeader() has already logged the problem and returned
+            # False; unpacking that would raise TypeError, so stop here.
+            return outtbl, ctoc_text
+        idxhdr, _, hordt2 = parsed
+        IndexCount = idxhdr["count"]
+
+        # handle the case of multiple sections used for CTOC
+        rec_off = 0
+        off = idx + IndexCount + 1
+        for j in range(idxhdr["nctoc"]):
+            cdata = sect.loadSection(off + j)
+            sect.setsectiondescription(off + j, label + " CTOC Data " + str(j))
+            ctocdict = self.readCTOC(cdata)
+            for k in ctocdict:
+                ctoc_text[k + rec_off] = ctocdict[k]
+            rec_off += 0x10000
+
+        # The TAGX table is looked up at offset header["len"] of the main
+        # section and is shared by all of the data sections below.
+        tagSectionStart = idxhdr["len"]
+        controlByteCount, tagTable = readTagSection(tagSectionStart, data)
+        if self.DEBUG:
+            # loguru formats with str.format(), so every value needs a "{}"
+            # placeholder; without one the argument is silently dropped.
+            logger.debug("ControlByteCount is {}", controlByteCount)
+            logger.debug("IndexCount is {}", IndexCount)
+            logger.debug("TagTable: {}", tagTable)
+
+        for i in range(idx + 1, idx + 1 + IndexCount):
+            sect.setsectiondescription(
+                i, "{0} Extra {1:d} INDX section".format(label, i - idx)
+            )
+            data = sect.loadSection(i)
+            parsed = self.parseINDXHeader(data)
+            if not parsed:
+                # Not an INDX section (already logged): skip it instead of
+                # failing while unpacking the False return value.
+                continue
+            hdrinfo = parsed[0]
+            idxtPos = hdrinfo["start"]
+            entryCount = hdrinfo["count"]
+            if self.DEBUG:
+                logger.debug("{} {}", idxtPos, entryCount)
+            # loop through to build up the IDXT position starts
+            idxPositions = [
+                struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))[0]
+                for j in range(entryCount)
+            ]
+            # The last entry ends before the IDXT tag (but there might be
+            # zero fill bytes we need to ignore!)
+            idxPositions.append(idxtPos)
+            # for each entry in the IDXT build up the tagMap and any
+            # associated text
+            for j in range(entryCount):
+                startPos = idxPositions[j]
+                endPos = idxPositions[j + 1]
+                # Each entry starts with a one byte text length, then the text.
+                textLength = ord(data[startPos : startPos + 1])
+                text = data[startPos + 1 : startPos + 1 + textLength]
+                if hordt2 is not None:
+                    # Map every text byte through the ORDT2 table of the
+                    # main header.
+                    text = b"".join(bchr(hordt2[bord(x)]) for x in text)
+                tagMap = getTagMap(
+                    controlByteCount,
+                    tagTable,
+                    data,
+                    startPos + 1 + textLength,
+                    endPos,
+                )
+                outtbl.append([text, tagMap])
+                if self.DEBUG:
+                    # CGDBG
+                    logger.debug("tagMap {}", tagMap)
+                    logger.debug("text {}", text)
+                    logger.debug("data {}", data)
+
+        return outtbl, ctoc_text
+
+    def parseINDXHeader(self, data):
+        """
+        Read an INDX header.
+
+        @param data: Raw bytes of one index section.
+        @return: False when data does not start with b"INDX"; otherwise a
+            tuple (header, ordt1, ordt2).  header maps the field names listed
+            in ``words`` to the thirteen big-endian 32-bit values that follow
+            the magic.  ordt1 / ordt2 are tuples of ORDT table entries, or
+            None when the section carries no ORDT tables.
+        """
+        if not data[:4] == b"INDX":
+            logger.debug("Warning: index section is not INDX")
+            return False
+        words = (
+            "len",
+            "nul1",
+            "type",
+            "gen",
+            "start",
+            "count",
+            "code",
+            "lng",
+            "total",
+            "ordt",
+            "ligt",
+            "nligt",
+            "nctoc",
+        )
+        num = len(words)
+        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
+        header = dict(zip(words, values))
+
+        ordt1 = None
+        ordt2 = None
+
+        ocnt, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
+        if header["code"] == 0xFDEA or ocnt != 0 or oentries > 0:
+            # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
+            # them in the proper place in the header.  They seem to be codepage 65002 which seems
+            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
+
+            # so we need to look for them and store them away to process leading text
+            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
+            # we only ever seem to use the second but ...
+            assert ocnt == 1
+            assert data[op1 : op1 + 4] == b"ORDT"
+            assert data[op2 : op2 + 4] == b"ORDT"
+            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
+            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)
+
+        if self.DEBUG:
+            logger.debug("parsed INDX header:")
+            for n in words:
+                # Goes through the logger like the rest of the debug output
+                # (this used to be a bare print to stdout).
+                logger.debug("{} {:X}", n, header[n])
+            logger.debug("")
+        return header, ordt1, ordt2
+
+    def readCTOC(self, txtdata):
+        """
+        Read all name blocks from one CTOC section.
+
+        Each block is a variable width length followed by that many bytes of
+        name.  Reading stops at the end of the data or at the first block
+        that starts with a NUL byte.
+
+        @param txtdata: Raw bytes of the CTOC section.
+        @return: Dict mapping the offset of each block to its name bytes.
+        """
+        ctoc_data = {}
+        offset = 0
+        while offset < len(txtdata):
+            # Slicing gives a length-1 bytestring on both Python 2 and 3, so
+            # one comparison replaces the PY2 / PY3 special cases.
+            if txtdata[offset : offset + 1] == b"\0":
+                break
+            idx_offs = offset
+            # first n bytes: name len as vwi
+            pos, ilen = getVariableWidthValue(txtdata, offset)
+            offset += pos
+            # <len> next bytes: name
+            name = txtdata[offset : offset + ilen]
+            offset += ilen
+            if self.DEBUG:
+                logger.debug("name length is {}", ilen)
+                # loguru does not do %-interpolation: "%s %s" was logged
+                # literally and the values were lost.
+                logger.debug("{} {}", idx_offs, name)
+            ctoc_data[idx_offs] = name
+        return ctoc_data
+
+
+def getVariableWidthValue(data, offset):
+    """
+    Decode one variable width integer from a byte string.
+
+    Every byte contributes its low seven bits, earlier bytes being more
+    significant.  The byte that has its high bit (0x80) set is the last
+    byte of the value.
+
+    @param data: The bytes to decode.
+    @param offset: The start offset into data.
+    @return: Tuple of consumed bytes count and decoded value.
+    """
+    result = 0
+    cursor = offset
+    while True:
+        # Slice rather than index so ord() gets a length-1 bytestring on
+        # both Python 2 and Python 3.
+        byte = ord(data[cursor : cursor + 1])
+        cursor += 1
+        result = (result << 7) | (byte & 0x7F)
+        if byte & 0x80:
+            break
+    return cursor - offset, result
+
+
+def readTagSection(start, data):
+    """
+    Parse the TAGX block located at a given position.
+
+    When data[start:start + 4] is not b"TAGX" nothing is read and
+    (0, []) is returned.
+
+    @param start: The start position in the data.
+    @param data: The data to process.
+    @return: Tuple of control byte count and list of tag tuples; each tag
+        tuple holds the four consecutive byte values of one table entry.
+    """
+    if data[start : start + 4] != b"TAGX":
+        return 0, []
+
+    (firstEntryOffset,) = struct.unpack_from(b">L", data, start + 0x04)
+    (controlByteCount,) = struct.unpack_from(b">L", data, start + 0x08)
+
+    # The 12 byte header (magic + the two words above) is followed by
+    # 4 byte entries up to firstEntryOffset.
+    tags = []
+    for rel in range(12, firstEntryOffset, 4):
+        base = start + rel
+        entry = tuple(ord(data[p : p + 1]) for p in range(base, base + 4))
+        tags.append(entry)
+    return controlByteCount, tags
+
+
+def countSetBits(value, bits=8):
+    """
+    Return how many of the lowest ``bits`` bits of value are set.
+
+    @param value: Integer value.
+    @param bits: The number of bits of the input value (defaults to 8).
+    @return: Number of set bits.
+    """
+    return sum((value >> shift) & 0x01 for shift in range(bits))
+
+
+def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
+    """
+    Decode the tag values of a single index entry.
+
+    Works in two passes: the control bytes at startPos decide, per tag,
+    how many values (or how many bytes of values) the entry carries; the
+    variable width values that follow the control bytes are then read in
+    tag table order.
+
+    @param controlByteCount: The number of control bytes.
+    @param tagTable: The tag table, a sequence of
+        (tag, valuesPerEntry, mask, endFlag) tuples.
+    @param entryData: The data to process.
+    @param startPos: The starting position in entryData.
+    @param endPos: The end position in entryData or None if it is unknown.
+    @return: Hashmap of tag and list of values.
+    """
+    # Pass 1: (tag, valueCount, valueBytes, valuesPerEntry) for every tag
+    # present; exactly one of valueCount / valueBytes is not None.
+    pending = []
+    ctrlIndex = 0
+    cursor = startPos + controlByteCount
+
+    for tag, valuesPerEntry, mask, endFlag in tagTable:
+        if endFlag == 0x01:
+            # Marker entry: the following tags use the next control byte.
+            ctrlIndex += 1
+            continue
+        ctrlPos = startPos + ctrlIndex
+        masked = ord(entryData[ctrlPos : ctrlPos + 1]) & mask
+        if masked == 0:
+            # Tag is not present in this entry.
+            continue
+        if masked != mask:
+            # Shift bits to get the masked value.
+            while mask & 0x01 == 0:
+                mask >>= 1
+                masked >>= 1
+            pending.append((tag, masked, None, valuesPerEntry))
+        elif countSetBits(mask) > 1:
+            # If all bits of masked value are set and the mask has more than one bit, a variable width value
+            # will follow after the control bytes which defines the length of bytes (NOT the value count!)
+            # which will contain the corresponding variable width values.
+            consumed, byteLength = getVariableWidthValue(entryData, cursor)
+            cursor += consumed
+            pending.append((tag, None, byteLength, valuesPerEntry))
+        else:
+            pending.append((tag, 1, None, valuesPerEntry))
+
+    # Pass 2: pull the actual values out of the data area.
+    tagHashMap = {}
+    for tag, valueCount, valueBytes, valuesPerEntry in pending:
+        values = []
+        if valueCount is None:
+            # Convert valueBytes to variable width values.
+            used = 0
+            while used < valueBytes:
+                # Does this work for valuesPerEntry != 1?
+                consumed, item = getVariableWidthValue(entryData, cursor)
+                cursor += consumed
+                used += consumed
+                values.append(item)
+            if used != valueBytes:
+                logger.debug(
+                    "Error: Should consume %s bytes, but consumed %s"
+                    % (valueBytes, used)
+                )
+        else:
+            # Read valueCount * valuesPerEntry variable width values.
+            for _ in range(valueCount * valuesPerEntry):
+                consumed, item = getVariableWidthValue(entryData, cursor)
+                cursor += consumed
+                values.append(item)
+        tagHashMap[tag] = values
+
+    # Test that all bytes have been processed if endPos is given.
+    if endPos is not None and cursor != endPos:
+        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
+        leftover = entryData[cursor:endPos]
+        if any(bord(ch) != 0 for ch in leftover):
+            logger.debug(
+                "Warning: There are unprocessed index bytes left: %s"
+                % toHex(leftover)
+            )
+
+    return tagHashMap