kindle manager

2024-04-03 15:08:22 +08:00
parent 6b3c0f3b6b
commit 6df3ce42a3
459 changed files with 164651 additions and 4690 deletions
--- a/mobiparse/mobi/mobi_html.py
+++ b/mobiparse/mobi/mobi_html.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+from .compatibility_utils import PY2, utf8_str
+from loguru import logger
+
+if PY2:
+    range = xrange
+
+import re
+
+# note: re requites the pattern to be the exact same type as the data to be searched in python3
+# but u"" is not allowed for the pattern itself only b""
+
+from .mobi_utils import fromBase32
+
+
+class HTMLProcessor:
+    def __init__(self, files, metadata, rscnames):
+        self.files = files
+        self.metadata = metadata
+        self.rscnames = rscnames
+        # for original style mobis, default to including all image files in the opf manifest
+        self.used = {}
+        for name in rscnames:
+            self.used[name] = "used"
+
+    def findAnchors(self, rawtext, indx_data, positionMap):
+        # process the raw text
+        # find anchors...
+        logger.debug("Find link anchors")
+        link_pattern = re.compile(
+            br"""<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>""", re.IGNORECASE
+        )
+        # TEST NCX: merge in filepos from indx
+        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
+        if indx_data:
+            pos_indx = [e["pos"] for e in indx_data if e["pos"] > 0]
+            pos_links = list(set(pos_links + pos_indx))
+
+        for position in pos_links:
+            if position in positionMap:
+                positionMap[position] = positionMap[position] + utf8_str(
+                    '<a id="filepos%d" />' % position
+                )
+            else:
+                positionMap[position] = utf8_str('<a id="filepos%d" />' % position)
+
+        # apply dictionary metadata and anchors
+        logger.debug("Insert data into html")
+        pos = 0
+        lastPos = len(rawtext)
+        dataList = []
+        for end in sorted(positionMap.keys()):
+            if end == 0 or end > lastPos:
+                continue  # something's up - can't put a tag in outside <html>...</html>
+            dataList.append(rawtext[pos:end])
+            dataList.append(positionMap[end])
+            pos = end
+        dataList.append(rawtext[pos:])
+        srctext = b"".join(dataList)
+        rawtext = None
+        dataList = None
+        self.srctext = srctext
+        self.indx_data = indx_data
+        return srctext
+
+    def insertHREFS(self):
+        srctext = self.srctext
+        rscnames = self.rscnames
+        metadata = self.metadata
+
+        # put in the hrefs
+        logger.debug("Insert hrefs into html")
+        # There doesn't seem to be a standard, so search as best as we can
+
+        link_pattern = re.compile(
+            br"""<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>""", re.IGNORECASE
+        )
+        srctext = link_pattern.sub(br"""<a\1href="#filepos\2"\3>""", srctext)
+
+        # remove empty anchors
+        logger.debug("Remove empty anchors from html")
+        srctext = re.sub(br"<a\s*/>", br"", srctext)
+        srctext = re.sub(br"<a\s*>\s*</a>", br"", srctext)
+
+        # convert image references
+        logger.debug("Insert image references into html")
+        # split string into image tag pieces and other pieces
+        image_pattern = re.compile(br"""(<img.*?>)""", re.IGNORECASE)
+        image_index_pattern = re.compile(
+            br"""recindex=['"]{0,1}([0-9]+)['"]{0,1}""", re.IGNORECASE
+        )
+        srcpieces = image_pattern.split(srctext)
+        srctext = self.srctext = None
+
+        # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext)
+        for i in range(1, len(srcpieces), 2):
+            tag = srcpieces[i]
+            for m in image_index_pattern.finditer(tag):
+                imageNumber = int(m.group(1))
+                imageName = rscnames[imageNumber - 1]
+                if imageName is None:
+                    logger.debug(
+                        "Error: Referenced image %s was not recognized as a valid image"
+                        % imageNumber
+                    )
+                else:
+                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
+                    tag = image_index_pattern.sub(replacement, tag, 1)
+            srcpieces[i] = tag
+        srctext = b"".join(srcpieces)
+
+        # add in character set meta into the html header if needed
+        if "Codec" in metadata:
+            srctext = (
+                srctext[0:12]
+                + b'<meta http-equiv="content-type" content="text/html; charset='
+                + utf8_str(metadata.get("Codec")[0])
+                + b'" />'
+                + srctext[12:]
+            )
+        return srctext, self.used
+
+
+class XHTMLK8Processor:
+    def __init__(self, rscnames, k8proc):
+        self.rscnames = rscnames
+        self.k8proc = k8proc
+        self.used = {}
+
+    def buildXHTML(self):
+
+        # first need to update all links that are internal which
+        # are based on positions within the xhtml files **BEFORE**
+        # cutting and pasting any pieces into the xhtml text files
+
+        #   kindle:pos:fid:XXXX:off:YYYYYYYYYY  (used for internal link within xhtml)
+        #       XXXX is the offset in records into divtbl
+        #       YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
+
+        # pos:fid pattern
+        posfid_pattern = re.compile(br"""(<a.*?href=.*?>)""", re.IGNORECASE)
+        posfid_index_pattern = re.compile(
+            br"""['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']"""
+        )
+
+        parts = []
+        logger.debug("Building proper xhtml for each file")
+        for i in range(self.k8proc.getNumberOfParts()):
+            part = self.k8proc.getPart(i)
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
+
+            # internal links
+            srcpieces = posfid_pattern.split(part)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if tag.startswith(b"<"):
+                    for m in posfid_index_pattern.finditer(tag):
+                        posfid = m.group(1)
+                        offset = m.group(2)
+                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
+                        if idtag == b"":
+                            replacement = b'"' + utf8_str(filename) + b'"'
+                        else:
+                            replacement = (
+                                b'"' + utf8_str(filename) + b"#" + idtag + b'"'
+                            )
+                        tag = posfid_index_pattern.sub(replacement, tag, 1)
+                    srcpieces[j] = tag
+            part = b"".join(srcpieces)
+            parts.append(part)
+
+        # we are free to cut and paste as we see fit
+        # we can safely remove all of the Kindlegen generated aid tags
+        # change aid ids that are in k8proc.linked_aids to xhtml ids
+        find_tag_with_aid_pattern = re.compile(
+            br"""(<[^>]*\said\s*=[^>]*>)""", re.IGNORECASE
+        )
+        within_tag_aid_position_pattern = re.compile(br"""\said\s*=['"]([^'"]*)['"]""")
+        for i in range(len(parts)):
+            part = parts[i]
+            srcpieces = find_tag_with_aid_pattern.split(part)
+            for j in range(len(srcpieces)):
+                tag = srcpieces[j]
+                if tag.startswith(b"<"):
+                    for m in within_tag_aid_position_pattern.finditer(tag):
+                        try:
+                            aid = m.group(1)
+                        except IndexError:
+                            aid = None
+                        replacement = b""
+                        if aid in self.k8proc.linked_aids:
+                            replacement = b' id="aid-' + aid + b'"'
+                        tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
+                    srcpieces[j] = tag
+            part = b"".join(srcpieces)
+            parts[i] = part
+
+        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
+        # with page-break-after style patterns
+        find_tag_with_AmznPageBreak_pattern = re.compile(
+            br"""(<[^>]*\sdata-AmznPageBreak=[^>]*>)""", re.IGNORECASE
+        )
+        within_tag_AmznPageBreak_position_pattern = re.compile(
+            br"""\sdata-AmznPageBreak=['"]([^'"]*)['"]"""
+        )
+        for i in range(len(parts)):
+            part = parts[i]
+            srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
+            for j in range(len(srcpieces)):
+                tag = srcpieces[j]
+                if tag.startswith(b"<"):
+                    srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
+                        lambda m: b' style="page-break-after:' + m.group(1) + b'"', tag
+                    )
+            part = b"".join(srcpieces)
+            parts[i] = part
+
+        # we have to handle substitutions for the flows  pieces first as they may
+        # be inlined into the xhtml text
+        #   kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+        #   kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+        #   kindle:embed:XXXX   (used for fonts)
+
+        flows = []
+        flows.append(None)
+        flowinfo = []
+        flowinfo.append([None, None, None, None])
+
+        # regular expression search patterns
+        img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
+        img_index_pattern = re.compile(
+            br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
+        )
+
+        tag_pattern = re.compile(br"""(<[^>]*>)""")
+        flow_pattern = re.compile(
+            br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
+        )
+
+        url_pattern = re.compile(br"""(url\(.*?\))""", re.IGNORECASE)
+        url_img_index_pattern = re.compile(
+            br"""[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]""",
+            re.IGNORECASE,
+        )
+        font_index_pattern = re.compile(
+            br"""[('"]kindle:embed:([0-9|A-V]+)["')]""", re.IGNORECASE
+        )
+        url_css_index_pattern = re.compile(
+            br"""kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*""", re.IGNORECASE
+        )
+        url_svg_image_pattern = re.compile(
+            br"""kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*""", re.IGNORECASE
+        )
+
+        for i in range(1, self.k8proc.getNumberOfFlows()):
+            [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
+            flowpart = self.k8proc.getFlow(i)
+
+            # links to raster image files from image tags
+            # image_pattern
+            srcpieces = img_pattern.split(flowpart)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if tag.startswith(b"<im"):
+                    for m in img_index_pattern.finditer(tag):
+                        imageNumber = fromBase32(m.group(1))
+                        imageName = self.rscnames[imageNumber - 1]
+                        if imageName is not None:
+                            replacement = b'"../Images/' + utf8_str(imageName) + b'"'
+                            self.used[imageName] = "used"
+                            tag = img_index_pattern.sub(replacement, tag, 1)
+                        else:
+                            logger.debug(
+                                "Error: Referenced image %s was not recognized as a valid image in %s"
+                                % (imageNumber, tag)
+                            )
+                    srcpieces[j] = tag
+            flowpart = b"".join(srcpieces)
+
+            # replacements inside css url():
+            srcpieces = url_pattern.split(flowpart)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+
+                #  process links to raster image files
+                for m in url_img_index_pattern.finditer(tag):
+                    imageNumber = fromBase32(m.group(1))
+                    imageName = self.rscnames[imageNumber - 1]
+                    osep = m.group()[0:1]
+                    csep = m.group()[-1:]
+                    if imageName is not None:
+                        replacement = osep + b"../Images/" + utf8_str(imageName) + csep
+                        self.used[imageName] = "used"
+                        tag = url_img_index_pattern.sub(replacement, tag, 1)
+                    else:
+                        logger.debug(
+                            "Error: Referenced image %s was not recognized as a valid image in %s"
+                            % (imageNumber, tag)
+                        )
+
+                # process links to fonts
+                for m in font_index_pattern.finditer(tag):
+                    fontNumber = fromBase32(m.group(1))
+                    fontName = self.rscnames[fontNumber - 1]
+                    osep = m.group()[0:1]
+                    csep = m.group()[-1:]
+                    if fontName is None:
+                        logger.debug(
+                            "Error: Referenced font %s was not recognized as a valid font in %s"
+                            % (fontNumber, tag)
+                        )
+                    else:
+                        replacement = osep + b"../Fonts/" + utf8_str(fontName) + csep
+                        tag = font_index_pattern.sub(replacement, tag, 1)
+                        self.used[fontName] = "used"
+
+                # process links to other css pieces
+                for m in url_css_index_pattern.finditer(tag):
+                    num = fromBase32(m.group(1))
+                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+                    replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
+                    tag = url_css_index_pattern.sub(replacement, tag, 1)
+                    self.used[fnm] = "used"
+
+                # process links to svg images
+                for m in url_svg_image_pattern.finditer(tag):
+                    num = fromBase32(m.group(1))
+                    [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+                    replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
+                    tag = url_svg_image_pattern.sub(replacement, tag, 1)
+                    self.used[fnm] = "used"
+
+                srcpieces[j] = tag
+            flowpart = b"".join(srcpieces)
+
+            # store away in our own copy
+            flows.append(flowpart)
+
+            # I do not think this case exists and even if it does exist, it needs to be done in a separate
+            # pass to prevent inlining a flow piece into another flow piece before the inserted one or the
+            # target one has been fully processed
+
+            # but keep it around if it ends up we do need it
+
+            # flow pattern not inside url()
+            # srcpieces = tag_pattern.split(flowpart)
+            # for j in range(1, len(srcpieces),2):
+            #     tag = srcpieces[j]
+            #     if tag.startswith(b'<'):
+            #         for m in flow_pattern.finditer(tag):
+            #             num = fromBase32(m.group(1))
+            #             [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+            #             flowtext = self.k8proc.getFlow(num)
+            #             if fmt == b'inline':
+            #                 tag = flowtext
+            #             else:
+            #                 replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
+            #                 tag = flow_pattern.sub(replacement, tag, 1)
+            #                 self.used[fnm] = 'used'
+            #         srcpieces[j] = tag
+            # flowpart = b"".join(srcpieces)
+
+        # now handle the main text xhtml parts
+
+        # Handle the flow items in the XHTML text pieces
+        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
+        tag_pattern = re.compile(br"""(<[^>]*>)""")
+        flow_pattern = re.compile(
+            br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
+        )
+        for i in range(len(parts)):
+            part = parts[i]
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+            # flow pattern
+            srcpieces = tag_pattern.split(part)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if tag.startswith(b"<"):
+                    for m in flow_pattern.finditer(tag):
+                        num = fromBase32(m.group(1))
+                        if num > 0 and num < len(self.k8proc.flowinfo):
+                            [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
+                            flowpart = flows[num]
+                            if fmt == b"inline":
+                                tag = flowpart
+                            else:
+                                replacement = (
+                                    b'"../'
+                                    + utf8_str(pdir)
+                                    + b"/"
+                                    + utf8_str(fnm)
+                                    + b'"'
+                                )
+                                tag = flow_pattern.sub(replacement, tag, 1)
+                                self.used[fnm] = "used"
+                        else:
+                            print(
+                                "warning: ignoring non-existent flow link",
+                                tag,
+                                " value 0x%x" % num,
+                            )
+                    srcpieces[j] = tag
+            part = b"".join(srcpieces)
+
+            # store away modified version
+            parts[i] = part
+
+        # Handle any embedded raster images links in style= attributes urls
+        style_pattern = re.compile(
+            br"""(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)""", re.IGNORECASE
+        )
+        img_index_pattern = re.compile(
+            br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
+        )
+
+        for i in range(len(parts)):
+            part = parts[i]
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+            # replace urls in style attributes
+            srcpieces = style_pattern.split(part)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if b"kindle:embed" in tag:
+                    for m in img_index_pattern.finditer(tag):
+                        imageNumber = fromBase32(m.group(1))
+                        imageName = self.rscnames[imageNumber - 1]
+                        osep = m.group()[0:1]
+                        csep = m.group()[-1:]
+                        if imageName is not None:
+                            replacement = (
+                                osep + b"../Images/" + utf8_str(imageName) + csep
+                            )
+                            self.used[imageName] = "used"
+                            tag = img_index_pattern.sub(replacement, tag, 1)
+                        else:
+                            logger.debug(
+                                "Error: Referenced image %s in style url was not recognized in %s"
+                                % (imageNumber, tag)
+                            )
+                    srcpieces[j] = tag
+            part = b"".join(srcpieces)
+
+            # store away modified version
+            parts[i] = part
+
+        # Handle any embedded raster images links in the xhtml text
+        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
+        img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
+        img_index_pattern = re.compile(br"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""")
+
+        for i in range(len(parts)):
+            part = parts[i]
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+            # links to raster image files
+            # image_pattern
+            srcpieces = img_pattern.split(part)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if tag.startswith(b"<im"):
+                    for m in img_index_pattern.finditer(tag):
+                        imageNumber = fromBase32(m.group(1))
+                        imageName = self.rscnames[imageNumber - 1]
+                        if imageName is not None:
+                            replacement = b'"../Images/' + utf8_str(imageName) + b'"'
+                            self.used[imageName] = "used"
+                            tag = img_index_pattern.sub(replacement, tag, 1)
+                        else:
+                            logger.debug(
+                                "Error: Referenced image %s was not recognized as a valid image in %s"
+                                % (imageNumber, tag)
+                            )
+                    srcpieces[j] = tag
+            part = b"".join(srcpieces)
+            # store away modified version
+            parts[i] = part
+
+        # finally perform any general cleanups needed to make valid XHTML
+        # these include:
+        #   in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio"
+        #   in svg tags replace "viewbox" attributes with "viewBox"
+        #   in <li> remove value="XX" attributes since these are illegal
+        tag_pattern = re.compile(br"""(<[^>]*>)""")
+        li_value_pattern = re.compile(
+            br"""\svalue\s*=\s*['"][^'"]*['"]""", re.IGNORECASE
+        )
+
+        for i in range(len(parts)):
+            part = parts[i]
+            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
+
+            # tag pattern
+            srcpieces = tag_pattern.split(part)
+            for j in range(1, len(srcpieces), 2):
+                tag = srcpieces[j]
+                if tag.startswith(b"<svg") or tag.startswith(b"<SVG"):
+                    tag = tag.replace(b"preserveaspectratio", b"preserveAspectRatio")
+                    tag = tag.replace(b"viewbox", b"viewBox")
+                elif tag.startswith(b"<li ") or tag.startswith(b"<LI "):
+                    tagpieces = li_value_pattern.split(tag)
+                    tag = b"".join(tagpieces)
+                srcpieces[j] = tag
+            part = b"".join(srcpieces)
+            # store away modified version
+            parts[i] = part
+
+        self.k8proc.setFlows(flows)
+        self.k8proc.setParts(parts)
+
+        return self.used