kindle manager
mobiparse/mobi/mobi_dict.py | 473 | Executable file
@@ -0,0 +1,473 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr

from loguru import logger

if PY2:
    range = xrange
    array_format = b"B"
if PY3:
    unichr = chr
    array_format = "B"

import array
import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

DEBUG_DICT = True


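# InflectionData wraps the raw inflection (INFL) index sections of a dictionary
# MOBI. Each section stores the start offset of its entry-offset table at 0x14
# and its entry count at 0x18; lookup() maps a global entry number to the
# section that holds it.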
class InflectionData(object):
    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for idata in self.infldatas:
            (start,) = struct.unpack_from(b">L", idata, 0x14)
            (count,) = struct.unpack_from(b">L", idata, 0x18)
            self.starts.append(start)
            self.counts.append(count)

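    # Map a global inflection entry number onto the section that contains it,
    # returning (local entry number, table start, entry count, section data).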
    def lookup(self, lookupvalue):
        i = 0
        rvalue = lookupvalue
        while rvalue >= self.counts[i]:
            rvalue = rvalue - self.counts[i]
            i += 1
            if i == len(self.counts):
                logger.debug("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return rvalue, self.starts[i], self.counts[i], self.infldatas[i]

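    # Return the offset of entry `value` within its section, the offset of the
    # following entry (None for the last entry), and the section data itself.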
    def offsets(self, value):
        rvalue, start, count, data = self.lookup(value)
        (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
        if rvalue + 1 < count:
            (nextOffset,) = struct.unpack_from(
                b">H", data, start + 4 + (2 * (rvalue + 1))
            )
        else:
            nextOffset = None
        return offset, nextOffset, data


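# dictSupport reads the orthographic and inflection indices of a dictionary
# MOBI and produces a map of text positions to <idx:...> markup that a caller
# can splice back into the extracted book text. Typical use (mh is the parsed
# MOBI header object, sect the section loader supplied elsewhere in this
# package):
#     positionMap = dictSupport(mh, sect).getPositionMap()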
class dictSupport(object):
    def __init__(self, mh, sect):
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

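    # Parse an INDX record header: thirteen big-endian longwords follow the
    # magic, with ORDT lookup-table info at offset 0xA4. Returns the header
    # dict plus the ORDT1/ORDT2 tables (None when not present).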
    def parseHeader(self, data):
        "read INDX header"
        if data[:4] != b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
        header["otype"] = otype
        header["oentries"] = oentries

        if DEBUG_DICT:
            logger.debug(
                "otype %d, oentries %d, op1 %d, op2 %d, otagx %d"
                % (otype, oentries, op1, op2, otagx)
            )

        if header["code"] == 0xFDEA or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if DEBUG_DICT:
            logger.debug("parsed INDX header:")
            for key in header:
                logger.debug("%s %x" % (key, header[key]))
            logger.debug("\n")
        return header, ordt1, ordt2

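    # Walk the orthographic index (and, when present, the inflection index) and
    # build a dict mapping byte positions in the book text to the <idx:entry>
    # markup that belongs at each position.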
    def getPositionMap(self):
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            logger.debug(
                "Info: Document contains orthographic index, handle as dictionary"
            )
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                logger.debug("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr["count"]
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr["len"]
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData
                )
                if DEBUG_DICT:
                    logger.debug("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    logger.debug(
                        "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    )
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            logger.debug("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr["len"]
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr["count"]
            logger.debug("orthIndexCount is %d" % orthIndexCount)
            if DEBUG_DICT:
                logger.debug("orthTagTable: %s" % tagTable)
                if hordt2 is not None:
                    logger.debug(
                        "orth entry uses ordt2 lookup table of type %d"
                        % idxhdr["otype"]
                    )
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                logger.debug("Info: Index doesn't contain entry length tags")

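            # Each index record lists its entries in an IDXT table: read the
            # entry start offsets, then decode each entry's text and tag map.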
logger.debug("Read dictionary index data")
|
||||
for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
|
||||
data = sect.loadSection(i)
|
||||
hdrinfo, ordt1, ordt2 = self.parseHeader(data)
|
||||
idxtPos = hdrinfo["start"]
|
||||
entryCount = hdrinfo["count"]
|
||||
idxPositions = []
|
||||
for j in range(entryCount):
|
||||
(pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
|
||||
idxPositions.append(pos)
|
||||
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||
idxPositions.append(idxtPos)
|
||||
for j in range(entryCount):
|
||||
startPos = idxPositions[j]
|
||||
endPos = idxPositions[j + 1]
|
||||
textLength = ord(data[startPos : startPos + 1])
|
||||
text = data[startPos + 1 : startPos + 1 + textLength]
|
||||
if hordt2 is not None:
|
||||
utext = ""
|
||||
if idxhdr["otype"] == 0:
|
||||
pattern = b">H"
|
||||
inc = 2
|
||||
else:
|
||||
pattern = b">B"
|
||||
inc = 1
|
||||
pos = 0
|
||||
while pos < textLength:
|
||||
(off,) = struct.unpack_from(pattern, text, pos)
|
||||
if off < len(hordt2):
|
||||
utext += unichr(hordt2[off])
|
||||
else:
|
||||
utext += unichr(off)
|
||||
pos += inc
|
||||
text = utext.encode("utf-8")
|
||||
|
||||
tagMap = getTagMap(
|
||||
controlByteCount,
|
||||
tagTable,
|
||||
data,
|
||||
startPos + 1 + textLength,
|
||||
endPos,
|
||||
)
|
||||
if 0x01 in tagMap:
|
||||
if decodeInflection and 0x2A in tagMap:
|
||||
inflectionGroups = self.getInflectionGroups(
|
||||
text,
|
||||
inflectionControlByteCount,
|
||||
inflectionTagTable,
|
||||
dinfl,
|
||||
inflNameData,
|
||||
tagMap[0x2A],
|
||||
)
|
||||
else:
|
||||
inflectionGroups = b""
|
||||
assert len(tagMap[0x01]) == 1
|
||||
entryStartPosition = tagMap[0x01][0]
|
||||
if hasEntryLength:
|
||||
# The idx:entry attribute "scriptable" must be present to create entry length tags.
|
||||
ml = (
|
||||
b'<idx:entry scriptable="yes"><idx:orth value="'
|
||||
+ text
|
||||
+ b'">'
|
||||
+ inflectionGroups
|
||||
+ b"</idx:orth>"
|
||||
)
|
||||
if entryStartPosition in positionMap:
|
||||
positionMap[entryStartPosition] = (
|
||||
positionMap[entryStartPosition] + ml
|
||||
)
|
||||
else:
|
||||
positionMap[entryStartPosition] = ml
|
||||
assert len(tagMap[0x02]) == 1
|
||||
entryEndPosition = entryStartPosition + tagMap[0x02][0]
|
||||
if entryEndPosition in positionMap:
|
||||
positionMap[entryEndPosition] = (
|
||||
b"</idx:entry>" + positionMap[entryEndPosition]
|
||||
)
|
||||
else:
|
||||
positionMap[entryEndPosition] = b"</idx:entry>"
|
||||
|
||||
else:
|
||||
indexTags = (
|
||||
b'<idx:entry>\n<idx:orth value="'
|
||||
+ text
|
||||
+ b'">\n'
|
||||
+ inflectionGroups
|
||||
+ b"</idx:entry>\n"
|
||||
)
|
||||
if entryStartPosition in positionMap:
|
||||
positionMap[entryStartPosition] = (
|
||||
positionMap[entryStartPosition] + indexTags
|
||||
)
|
||||
else:
|
||||
positionMap[entryStartPosition] = indexTags
|
||||
return positionMap
|
||||
|
||||
def hasTag(self, tagTable, tag):
|
||||
"""
|
||||
Test if tag table contains given tag.
|
||||
|
||||
@param tagTable: The tag table.
|
||||
@param tag: The tag to search.
|
||||
@return: True if tag table contains given tag; False otherwise.
|
||||
"""
|
||||
for currentTag, _, _, _ in tagTable:
|
||||
if currentTag == tag:
|
||||
return True
|
||||
return False
|
||||
|
||||
def getInflectionGroups(
|
||||
self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList
|
||||
):
|
||||
"""
|
||||
Create string which contains the inflection groups with inflection rules as mobipocket tags.
|
||||
|
||||
@param mainEntry: The word to inflect.
|
||||
@param controlByteCount: The number of control bytes.
|
||||
@param tagTable: The tag table.
|
||||
@param data: The Inflection data object to properly select the right inflection data section to use
|
||||
@param inflectionNames: The inflection rule name data.
|
||||
@param groupList: The list of inflection groups to process.
|
||||
@return: String with inflection groups and rules or empty string if required tags are not available.
|
||||
"""
|
||||
result = b""
|
||||
for value in groupList:
|
||||
offset, nextOffset, data = dinfl.offsets(value)
|
||||
|
||||
# First byte seems to be always 0x00 and must be skipped.
|
||||
assert ord(data[offset : offset + 1]) == 0x00
|
||||
tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
|
||||
|
||||
# Make sure that the required tags are available.
|
||||
if 0x05 not in tagMap:
|
||||
logger.debug("Error: Required tag 0x05 not found in tagMap")
|
||||
return ""
|
||||
if 0x1A not in tagMap:
|
||||
logger.debug("Error: Required tag 0x1a not found in tagMap")
|
||||
return b""
|
||||
|
||||
result += b"<idx:infl>"
|
||||
|
||||
for i in range(len(tagMap[0x05])):
|
||||
|
||||
# Get name of inflection rule.
|
||||
value = tagMap[0x05][i]
|
||||
consumed, textLength = getVariableWidthValue(inflectionNames, value)
|
||||
inflectionName = inflectionNames[
|
||||
value + consumed : value + consumed + textLength
|
||||
]
|
||||
|
||||
# Get and apply inflection rule across possibly multiple inflection data sections
|
||||
value = tagMap[0x1A][i]
|
||||
rvalue, start, count, data = dinfl.lookup(value)
|
||||
(offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
|
||||
textLength = ord(data[offset : offset + 1])
|
||||
inflection = self.applyInflectionRule(
|
||||
mainEntry, data, offset + 1, offset + 1 + textLength
|
||||
)
|
||||
if inflection is not None:
|
||||
result += (
|
||||
b' <idx:iform name="'
|
||||
+ inflectionName
|
||||
+ b'" value="'
|
||||
+ inflection
|
||||
+ b'"/>'
|
||||
)
|
||||
|
||||
result += b"</idx:infl>"
|
||||
return result
|
||||
|
||||
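    # Inflection rules are compact byte programs: 0x01-0x04 switch the edit
    # mode (insert/delete at word start/end), 0x0A-0x13 move the cursor
    # backwards by (byte - 0x0A), and any byte above 0x13 is a literal
    # character to insert, or the character expected to be deleted, in the
    # current mode.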
    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        """
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        """
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset : charOffset + 1]
            abyte = ord(char)
            if abyte >= 0x0A and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0A
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                elif position == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x03: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x04: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    else:
                        logger.debug(
                            "Error: Inflection rule mode %x is not implemented" % mode
                        )
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            else:
                logger.debug(
                    "Error: Inflection rule mode %x is not implemented" % abyte
                )
                return None
        # array.tostring() was removed in Python 3.9; use tobytes() under Python 3.
        if PY3:
            return utf8_str(byteArray.tobytes())
        return utf8_str(byteArray.tostring())