kindle manager

2021-08-25 17:58:31 +08:00
parent 5f0c0a9724
commit 6b3c0f3b6b
303 changed files with 87829 additions and 42537 deletions
--- a/mobimaster/mobi/mobi_k8proc.py
+++ b/mobimaster/mobi/mobi_k8proc.py
@@ -0,0 +1,575 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, bstr, utf8_str
+from loguru import logger
+
+if PY2:
+    range = xrange
+
+import os
+
+import struct
+
+# note:  struct pack, unpack, unpack_from all require bytestring format
+# data all the way up to at least python 2.7.5, python 3 okay with bytestring
+
+import re
+
+# note: re requites the pattern to be the exact same type as the data to be searched in python3
+# but u"" is not allowed for the pattern itself only b""
+
+from .mobi_index import MobiIndex
+from .mobi_utils import fromBase32
+from .unipath import pathof
+
+_guide_types = [
+    b"cover",
+    b"title-page",
+    b"toc",
+    b"index",
+    b"glossary",
+    b"acknowledgements",
+    b"bibliography",
+    b"colophon",
+    b"copyright-page",
+    b"dedication",
+    b"epigraph",
+    b"foreward",
+    b"loi",
+    b"lot",
+    b"notes",
+    b"preface",
+    b"text",
+]
+
+# locate beginning and ending positions of tag with specific aid attribute
+def locate_beg_end_of_tag(ml, aid):
+    pattern = utf8_str(r"""<[^>]*\said\s*=\s*['"]%s['"][^>]*>""" % aid)
+    aid_pattern = re.compile(pattern, re.IGNORECASE)
+    for m in re.finditer(aid_pattern, ml):
+        plt = m.start()
+        pgt = ml.find(b">", plt + 1)
+        return plt, pgt
+    return 0, 0
+
+
+# iterate over all tags in block in reverse order, i.e. last ta to first tag
+def reverse_tag_iter(block):
+    end = len(block)
+    while True:
+        pgt = block.rfind(b">", 0, end)
+        if pgt == -1:
+            break
+        plt = block.rfind(b"<", 0, pgt)
+        if plt == -1:
+            break
+        yield block[plt : pgt + 1]
+        end = plt
+
+
+class K8Processor:
+    def __init__(self, mh, sect, files, debug=False):
+        self.sect = sect
+        self.files = files
+        self.mi = MobiIndex(sect)
+        self.mh = mh
+        self.skelidx = mh.skelidx
+        self.fragidx = mh.fragidx
+        self.guideidx = mh.guideidx
+        self.fdst = mh.fdst
+        self.flowmap = {}
+        self.flows = None
+        self.flowinfo = []
+        self.parts = None
+        self.partinfo = []
+        self.linked_aids = set()
+        self.fdsttbl = [0, 0xFFFFFFFF]
+        self.DEBUG = debug
+
+        # read in and parse the FDST info which is very similar in format to the Palm DB section
+        # parsing except it provides offsets into rawML file and not the Palm DB file
+        # this is needed to split up the final css, svg, etc flow section
+        # that can exist at the end of the rawML file
+        if self.fdst != 0xFFFFFFFF:
+            header = self.sect.loadSection(self.fdst)
+            if header[0:4] == b"FDST":
+                (num_sections,) = struct.unpack_from(b">L", header, 0x08)
+                self.fdsttbl = struct.unpack_from(
+                    bstr(">%dL" % (num_sections * 2)), header, 12
+                )[::2] + (mh.rawSize,)
+                sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
+                if self.DEBUG:
+                    logger.debug("\nFDST Section Map:  %d sections" % num_sections)
+                    for j in range(num_sections):
+                        logger.debug(
+                            "Section %d: 0x%08X - 0x%08X"
+                            % (j, self.fdsttbl[j], self.fdsttbl[j + 1])
+                        )
+            else:
+                logger.debug("\nError: K8 Mobi with Missing FDST info")
+
+        # read/process skeleton index info to create the skeleton table
+        skeltbl = []
+        if self.skelidx != 0xFFFFFFFF:
+            # for i in range(2):
+            #     fname = 'skel%04d.dat' % i
+            #     data = self.sect.loadSection(self.skelidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
+            fileptr = 0
+            for [text, tagMap] in outtbl:
+                # file number, skeleton name, fragtbl record count, start position, length
+                skeltbl.append(
+                    [fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]
+                )
+                fileptr += 1
+        self.skeltbl = skeltbl
+        if self.DEBUG:
+            logger.debug("\nSkel Table:  %d entries" % len(self.skeltbl))
+            logger.debug(
+                "table: filenum, skeleton name, frag tbl record count, start position, length"
+            )
+            for j in range(len(self.skeltbl)):
+                logger.debug(self.skeltbl[j])
+
+        # read/process the fragment index to create the fragment table
+        fragtbl = []
+        if self.fragidx != 0xFFFFFFFF:
+            # for i in range(3):
+            #     fname = 'frag%04d.dat' % i
+            #     data = self.sect.loadSection(self.fragidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
+            for [text, tagMap] in outtbl:
+                # insert position, ctoc offset (aidtext), file number, sequence number, start position, length
+                ctocoffset = tagMap[2][0]
+                ctocdata = ctoc_text[ctocoffset]
+                fragtbl.append(
+                    [
+                        int(text),
+                        ctocdata,
+                        tagMap[3][0],
+                        tagMap[4][0],
+                        tagMap[6][0],
+                        tagMap[6][1],
+                    ]
+                )
+        self.fragtbl = fragtbl
+        if self.DEBUG:
+            logger.debug("\nFragment Table: %d entries" % len(self.fragtbl))
+            logger.debug(
+                "table: file position, link id text, file num, sequence number, start position, length"
+            )
+            for j in range(len(self.fragtbl)):
+                logger.debug(self.fragtbl[j])
+
+        # read / process guide index for guide elements of opf
+        guidetbl = []
+        if self.guideidx != 0xFFFFFFFF:
+            # for i in range(3):
+            #     fname = 'guide%04d.dat' % i
+            #     data = self.sect.loadSection(self.guideidx + i)
+            #     with open(pathof(fname), 'wb') as f:
+            #         f.write(data)
+            outtbl, ctoc_text = self.mi.getIndexData(
+                self.guideidx, "KF8 Guide elements)"
+            )
+            for [text, tagMap] in outtbl:
+                # ref_type, ref_title, frag number
+                ctocoffset = tagMap[1][0]
+                ref_title = ctoc_text[ctocoffset]
+                ref_type = text
+                fileno = None
+                if 3 in tagMap:
+                    fileno = tagMap[3][0]
+                if 6 in tagMap:
+                    fileno = tagMap[6][0]
+                guidetbl.append([ref_type, ref_title, fileno])
+        self.guidetbl = guidetbl
+        if self.DEBUG:
+            logger.debug("\nGuide Table: %d entries" % len(self.guidetbl))
+            logger.debug("table: ref_type, ref_title, fragtbl entry number")
+            for j in range(len(self.guidetbl)):
+                logger.debug(self.guidetbl[j])
+
+    def buildParts(self, rawML):
+        # now split the rawML into its flow pieces
+        self.flows = []
+        for j in range(0, len(self.fdsttbl) - 1):
+            start = self.fdsttbl[j]
+            end = self.fdsttbl[j + 1]
+            self.flows.append(rawML[start:end])
+
+        # the first piece represents the xhtml text
+        text = self.flows[0]
+        self.flows[0] = b""
+
+        # walk the <skeleton> and fragment tables to build original source xhtml files
+        # *without* destroying any file position information needed for later href processing
+        # and create final list of file separation start: stop points and etc in partinfo
+        if self.DEBUG:
+            logger.debug("\nRebuilding flow piece 0: the main body of the ebook")
+        self.parts = []
+        self.partinfo = []
+        fragptr = 0
+        baseptr = 0
+        cnt = 0
+        filename = "part%04d.xhtml" % cnt
+        for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
+            baseptr = skelpos + skellen
+            skeleton = text[skelpos:baseptr]
+            aidtext = "0"
+            for i in range(fragcnt):
+                [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[
+                    fragptr
+                ]
+                aidtext = idtext[12:-2]
+                if i == 0:
+                    filename = "part%04d.xhtml" % filenum
+                slice = text[baseptr : baseptr + length]
+                insertpos = insertpos - skelpos
+                head = skeleton[:insertpos]
+                tail = skeleton[insertpos:]
+                actual_inspos = insertpos
+                if tail.find(b">") < tail.find(b"<") or head.rfind(b">") < head.rfind(
+                    b"<"
+                ):
+                    # There is an incomplete tag in either the head or tail.
+                    # This can happen for some badly formed KF8 files
+                    logger.debug(
+                        "The fragment table for %s has incorrect insert position. Calculating manually."
+                        % skelname
+                    )
+                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
+                    if bp != ep:
+                        actual_inspos = ep + 1 + startpos
+                if insertpos != actual_inspos:
+                    print(
+                        "fixed corrupt fragment table insert position",
+                        insertpos + skelpos,
+                        actual_inspos + skelpos,
+                    )
+                    insertpos = actual_inspos
+                    self.fragtbl[fragptr][0] = actual_inspos + skelpos
+                skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
+                baseptr = baseptr + length
+                fragptr += 1
+            cnt += 1
+            self.parts.append(skeleton)
+            self.partinfo.append([skelnum, "Text", filename, skelpos, baseptr, aidtext])
+
+        assembled_text = b"".join(self.parts)
+        if self.DEBUG:
+            outassembled = os.path.join(self.files.k8dir, "assembled_text.dat")
+            with open(pathof(outassembled), "wb") as f:
+                f.write(assembled_text)
+
+        # The primary css style sheet is typically stored next followed by any
+        # snippets of code that were previously inlined in the
+        # original xhtml but have been stripped out and placed here.
+        # This can include local CDATA snippets and and svg sections.
+
+        # The problem is that for most browsers and ereaders, you can not
+        # use <img src="imageXXXX.svg" /> to import any svg image that itself
+        # properly uses an <image/> tag to import some raster image - it
+        # should work according to the spec but does not for almost all browsers
+        # and ereaders and causes epub validation issues because those  raster
+        # images are in manifest but not in xhtml text - since they only
+        # referenced from an svg image
+
+        # So we need to check the remaining flow pieces to see if they are css
+        # or svg images.  if svg images, we must check if they have an <image />
+        # and if so inline them into the xhtml text pieces.
+
+        # there may be other sorts of pieces stored here but until we see one
+        # in the wild to reverse engineer we won't be able to tell
+        self.flowinfo.append([None, None, None, None])
+        svg_tag_pattern = re.compile(br"""(<svg[^>]*>)""", re.IGNORECASE)
+        image_tag_pattern = re.compile(br"""(<image[^>]*>)""", re.IGNORECASE)
+        for j in range(1, len(self.flows)):
+            flowpart = self.flows[j]
+            nstr = "%04d" % j
+            m = re.search(svg_tag_pattern, flowpart)
+            if m is not None:
+                # svg
+                ptype = b"svg"
+                start = m.start()
+                m2 = re.search(image_tag_pattern, flowpart)
+                if m2 is not None:
+                    pformat = b"inline"
+                    pdir = None
+                    fname = None
+                    # strip off anything before <svg if inlining
+                    flowpart = flowpart[start:]
+                else:
+                    pformat = b"file"
+                    pdir = "Images"
+                    fname = "svgimg" + nstr + ".svg"
+            else:
+                # search for CDATA and if exists inline it
+                if flowpart.find(b"[CDATA[") >= 0:
+                    ptype = b"css"
+                    flowpart = b'<style type="text/css">\n' + flowpart + b"\n</style>\n"
+                    pformat = b"inline"
+                    pdir = None
+                    fname = None
+                else:
+                    # css - assume as standalone css file
+                    ptype = b"css"
+                    pformat = b"file"
+                    pdir = "Styles"
+                    fname = "style" + nstr + ".css"
+
+            self.flows[j] = flowpart
+            self.flowinfo.append([ptype, pformat, pdir, fname])
+
+        if self.DEBUG:
+            logger.debug("\nFlow Map:  %d entries" % len(self.flowinfo))
+            for fi in self.flowinfo:
+                logger.debug(fi)
+            logger.debug("\n")
+
+            logger.debug(
+                "\nXHTML File Part Position Information: %d entries"
+                % len(self.partinfo)
+            )
+            for pi in self.partinfo:
+                logger.debug(pi)
+
+        if False:  # self.Debug:
+            # dump all of the locations of the aid tags used in TEXT
+            # find id links only inside of tags
+            #    inside any < > pair find all "aid=' and return whatever is inside the quotes
+            #    [^>]* means match any amount of chars except for  '>' char
+            #    [^'"] match any amount of chars except for the quote character
+            #    \s* means match any amount of whitespace
+            logger.debug("\npositions of all aid= pieces")
+            id_pattern = re.compile(
+                br"""<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>""", re.IGNORECASE
+            )
+            for m in re.finditer(id_pattern, rawML):
+                [filename, partnum, start, end] = self.getFileInfo(m.start())
+                [seqnum, idtext] = self.getFragTblInfo(m.start())
+                value = fromBase32(m.group(1))
+                logger.debug(
+                    "  aid: %s value: %d at: %d -> part: %d, start: %d, end: %d"
+                    % (m.group(1), value, m.start(), partnum, start, end)
+                )
+                logger.debug("       %s  fragtbl entry %d" % (idtext, seqnum))
+
+        return
+
+    # get information fragment table entry by pos
+    def getFragTblInfo(self, pos):
+        for j in range(len(self.fragtbl)):
+            [insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
+            if pos >= insertpos and pos < (insertpos + length):
+                # why are these "in: and before: added here
+                return seqnum, b"in: " + idtext
+            if pos < insertpos:
+                return seqnum, b"before: " + idtext
+        return None, None
+
+    # get information about the part (file) that exists at pos in original rawML
+    def getFileInfo(self, pos):
+        for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
+            if pos >= start and pos < end:
+                return filename, partnum, start, end
+        return None, None, None, None
+
+    # accessor functions to properly protect the internal structure
+    def getNumberOfParts(self):
+        return len(self.parts)
+
+    def getPart(self, i):
+        if i >= 0 and i < len(self.parts):
+            return self.parts[i]
+        return None
+
+    def getPartInfo(self, i):
+        if i >= 0 and i < len(self.partinfo):
+            return self.partinfo[i]
+        return None
+
+    def getNumberOfFlows(self):
+        return len(self.flows)
+
+    def getFlow(self, i):
+        # note flows[0] is empty - it was all of the original text
+        if i > 0 and i < len(self.flows):
+            return self.flows[i]
+        return None
+
+    def getFlowInfo(self, i):
+        # note flowinfo[0] is empty - it was all of the original text
+        if i > 0 and i < len(self.flowinfo):
+            return self.flowinfo[i]
+        return None
+
+    def getIDTagByPosFid(self, posfid, offset):
+        # first convert kindle:pos:fid and offset info to position in file
+        # (fromBase32 can handle both string types on input)
+        row = fromBase32(posfid)
+        off = fromBase32(offset)
+        [insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
+        pos = insertpos + off
+        fname, pn, skelpos, skelend = self.getFileInfo(pos)
+        if fname is None:
+            # pos does not exist
+            # default to skeleton pos instead
+            print(
+                "Link To Position", pos, "does not exist, retargeting to top of target"
+            )
+            pos = self.skeltbl[filenum][3]
+            fname, pn, skelpos, skelend = self.getFileInfo(pos)
+        # an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
+        # Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
+        # some position information encoded into Base32 name.
+        # so find the closest "id=" before position the file  by actually searching in that file
+        idtext = self.getIDTag(pos)
+        return fname, idtext
+
+    def getIDTag(self, pos):
+        # find the first tag with a named anchor (name or id attribute) before pos
+        fname, pn, skelpos, skelend = self.getFileInfo(pos)
+        if pn is None and skelpos is None:
+            logger.debug("Error: getIDTag - no file contains %s" % pos)
+        textblock = self.parts[pn]
+        npos = pos - skelpos
+        # if npos inside a tag then search all text before the its end of tag marker
+        pgt = textblock.find(b">", npos)
+        plt = textblock.find(b"<", npos)
+        if plt == npos or pgt < plt:
+            npos = pgt + 1
+        # find id and name attributes only inside of tags
+        # use a reverse tag search since that is faster
+        #    inside any < > pair find "id=" and "name=" attributes return it
+        #    [^>]* means match any amount of chars except for  '>' char
+        #    [^'"] match any amount of chars except for the quote character
+        #    \s* means match any amount of whitespace
+        textblock = textblock[0:npos]
+        id_pattern = re.compile(
+            br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
+        )
+        name_pattern = re.compile(
+            br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
+        )
+        aid_pattern = re.compile(br"""<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]""")
+        for tag in reverse_tag_iter(textblock):
+            # any ids in the body should default to top of file
+            if tag[0:6] == b"<body ":
+                return b""
+            if tag[0:6] != b"<meta ":
+                m = id_pattern.match(tag) or name_pattern.match(tag)
+                if m is not None:
+                    return m.group(1)
+                m = aid_pattern.match(tag)
+                if m is not None:
+                    self.linked_aids.add(m.group(1))
+                    return b"aid-" + m.group(1)
+        return b""
+
+    # do we need to do deep copying
+    def setParts(self, parts):
+        assert len(parts) == len(self.parts)
+        for i in range(len(parts)):
+            self.parts[i] = parts[i]
+
+    # do we need to do deep copying
+    def setFlows(self, flows):
+        assert len(flows) == len(self.flows)
+        for i in range(len(flows)):
+            self.flows[i] = flows[i]
+
+    # get information about the part (file) that exists at pos in original rawML
+    def getSkelInfo(self, pos):
+        for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
+            if pos >= start and pos < end:
+                return [partnum, pdir, filename, start, end, aidtext]
+        return [None, None, None, None, None, None]
+
+    # fileno is actually a reference into fragtbl (a fragment)
+    def getGuideText(self):
+        guidetext = b""
+        for [ref_type, ref_title, fileno] in self.guidetbl:
+            if ref_type == b"thumbimagestandard":
+                continue
+            if ref_type not in _guide_types and not ref_type.startswith(b"other."):
+                if ref_type == b"start":
+                    ref_type = b"text"
+                else:
+                    ref_type = b"other." + ref_type
+            [pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
+            [pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
+            idtext = self.getIDTag(pos)
+            linktgt = filename.encode("utf-8")
+            if idtext != b"":
+                linktgt += b"#" + idtext
+            guidetext += (
+                b'<reference type="'
+                + ref_type
+                + b'" title="'
+                + ref_title
+                + b'" href="'
+                + utf8_str(pdir)
+                + b"/"
+                + linktgt
+                + b'" />\n'
+            )
+        # opf is encoded utf-8 so must convert any titles properly
+        guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
+        return guidetext
+
+    def getPageIDTag(self, pos):
+        # find the first tag with a named anchor (name or id attribute) before pos
+        # but page map offsets need to little more leeway so if the offset points
+        # into a tag look for the next ending tag "/>" or "</" and start your search from there.
+        fname, pn, skelpos, skelend = self.getFileInfo(pos)
+        if pn is None and skelpos is None:
+            logger.debug("Error: getIDTag - no file contains %s" % pos)
+        textblock = self.parts[pn]
+        npos = pos - skelpos
+        # if npos inside a tag then search all text before next ending tag
+        pgt = textblock.find(b">", npos)
+        plt = textblock.find(b"<", npos)
+        if plt == npos or pgt < plt:
+            # we are in a tag
+            # so find first ending tag
+            pend1 = textblock.find(b"/>", npos)
+            pend2 = textblock.find(b"</", npos)
+            if pend1 != -1 and pend2 != -1:
+                pend = min(pend1, pend2)
+            else:
+                pend = max(pend1, pend2)
+            if pend != -1:
+                npos = pend
+            else:
+                npos = pgt + 1
+        # find id and name attributes only inside of tags
+        # use a reverse tag search since that is faster
+        #    inside any < > pair find "id=" and "name=" attributes return it
+        #    [^>]* means match any amount of chars except for  '>' char
+        #    [^'"] match any amount of chars except for the quote character
+        #    \s* means match any amount of whitespace
+        textblock = textblock[0:npos]
+        id_pattern = re.compile(
+            br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
+        )
+        name_pattern = re.compile(
+            br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
+        )
+        for tag in reverse_tag_iter(textblock):
+            # any ids in the body should default to top of file
+            if tag[0:6] == b"<body ":
+                return b""
+            if tag[0:6] != b"<meta ":
+                m = id_pattern.match(tag) or name_pattern.match(tag)
+                if m is not None:
+                    return m.group(1)
+        return b""