kindle manager

2024-04-03 15:08:22 +08:00
parent 6b3c0f3b6b
commit 6df3ce42a3
459 changed files with 164651 additions and 4690 deletions
--- a/mobiparse/mobi/mobi_k8resc.py
+++ b/mobiparse/mobi/mobi_k8resc.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supported in Python >= 2.7.
+""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
+
+# dict_ is the mapping factory K8RESCProcessor.parsetag uses for tag
+# attributes; with the flag on, attributes keep their order of appearance.
+if DEBUG_USE_ORDERED_DICTIONARY:
+    from collections import OrderedDict as dict_
+else:
+    dict_ = dict
+
+from .compatibility_utils import unicode_str
+from loguru import logger
+
+from .mobi_utils import fromBase32
+
+# Container-level tag names.  K8RESCProcessor.resc_tag_iter yields these as
+# soon as their begin tag is seen, and reports their end tag under the name
+# "<tname>-end"; every other begin tag is held back until its end tag.
+_OPF_PARENT_TAGS = [
+    "xml",
+    "package",
+    "metadata",
+    "dc-metadata",
+    "x-metadata",
+    "manifest",
+    "spine",
+    "tours",
+    "guide",
+]
+
+
+class K8RESCProcessor(object):
+    def __init__(self, data, debug=False):
+        self._debug = debug
+        self.resc = None
+        self.opos = 0
+        self.extrameta = []
+        self.cover_name = None
+        self.spine_idrefs = {}
+        self.spine_order = []
+        self.spine_pageattributes = {}
+        self.spine_ppd = None
+        # need3 indicate the book has fields which require epub3.
+        # but the estimation of the source epub version from the fields is difficult.
+        self.need3 = False
+        self.package_ver = None
+        self.extra_metadata = []
+        self.refines_metadata = []
+        self.extra_attributes = []
+        # get header
+        start_pos = data.find(b"<")
+        self.resc_header = data[:start_pos]
+        # get resc data length
+        start = self.resc_header.find(b"=") + 1
+        end = self.resc_header.find(b"&", start)
+        resc_size = 0
+        if end > 0:
+            resc_size = fromBase32(self.resc_header[start:end])
+        resc_rawbytes = len(data) - start_pos
+        if resc_rawbytes == resc_size:
+            self.resc_length = resc_size
+        else:
+            # Most RESC has a nul string at its tail but some do not.
+            end_pos = data.find(b"\x00", start_pos)
+            if end_pos < 0:
+                self.resc_length = resc_rawbytes
+            else:
+                self.resc_length = end_pos - start_pos
+        if self.resc_length != resc_size:
+            logger.debug(
+                "Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(
+                    self.resc_length, resc_size
+                )
+            )
+        # now parse RESC after converting it to unicode from utf-8
+        self.resc = unicode_str(data[start_pos : start_pos + self.resc_length])
+        self.parseData()
+
+    def prepend_to_spine(self, key, idref, linear, properties):
+        self.spine_order = [key] + self.spine_order
+        self.spine_idrefs[key] = idref
+        attributes = {}
+        if linear is not None:
+            attributes["linear"] = linear
+        if properties is not None:
+            attributes["properties"] = properties
+        self.spine_pageattributes[key] = attributes
+
+    # RESC tag iterator
+    def resc_tag_iter(self):
+        tcontent = last_tattr = None
+        prefix = [""]
+        while True:
+            text, tag = self.parseresc()
+            if text is None and tag is None:
+                break
+            if text is not None:
+                tcontent = text.rstrip(" \r\n")
+            else:  # we have a tag
+                ttype, tname, tattr = self.parsetag(tag)
+                if ttype == "begin":
+                    tcontent = None
+                    prefix.append(tname + ".")
+                    if tname in _OPF_PARENT_TAGS:
+                        yield "".join(prefix), tname, tattr, tcontent
+                    else:
+                        last_tattr = tattr
+                else:  # single or end
+                    if ttype == "end":
+                        prefix.pop()
+                        tattr = last_tattr
+                        last_tattr = None
+                        if tname in _OPF_PARENT_TAGS:
+                            tname += "-end"
+                    yield "".join(prefix), tname, tattr, tcontent
+                    tcontent = None
+
+    # now parse the RESC to extract spine and extra metadata info
+    def parseData(self):
+        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
+            if self._debug:
+                logger.debug(
+                    "   Parsing RESC: %s %s %s %s" % (prefix, tname, tattr, tcontent)
+                )
+            if tname == "package":
+                self.package_ver = tattr.get("version", "2.0")
+                package_prefix = tattr.get("prefix", "")
+                if self.package_ver.startswith("3") or package_prefix.startswith(
+                    "rendition"
+                ):
+                    self.need3 = True
+            if tname == "spine":
+                self.spine_ppd = tattr.get("page-progession-direction", None)
+                if self.spine_ppd is not None and self.spine_ppd == "rtl":
+                    self.need3 = True
+            if tname == "itemref":
+                skelid = tattr.pop("skelid", None)
+                if skelid is None and len(self.spine_order) == 0:
+                    # assume it was removed initial coverpage
+                    skelid = "coverpage"
+                    tattr["linear"] = "no"
+                self.spine_order.append(skelid)
+                idref = tattr.pop("idref", None)
+                if idref is not None:
+                    idref = "x_" + idref
+                self.spine_idrefs[skelid] = idref
+                if "id" in tattr:
+                    del tattr["id"]
+                # tattr["id"] = 'x_' + tattr["id"]
+                if "properties" in tattr:
+                    self.need3 = True
+                self.spine_pageattributes[skelid] = tattr
+            if tname == "meta" or tname.startswith("dc:"):
+                if "refines" in tattr or "property" in tattr:
+                    self.need3 = True
+                if tattr.get("name", "") == "cover":
+                    cover_name = tattr.get("content", None)
+                    if cover_name is not None:
+                        cover_name = "x_" + cover_name
+                    self.cover_name = cover_name
+                else:
+                    self.extrameta.append([tname, tattr, tcontent])
+
+    # parse and return either leading text or the next tag
+    def parseresc(self):
+        p = self.opos
+        if p >= len(self.resc):
+            return None, None
+        if self.resc[p] != "<":
+            res = self.resc.find("<", p)
+            if res == -1:
+                res = len(self.resc)
+            self.opos = res
+            return self.resc[p:res], None
+        # handle comment as a special case
+        if self.resc[p : p + 4] == "<!--":
+            te = self.resc.find("-->", p + 1)
+            if te != -1:
+                te = te + 2
+        else:
+            te = self.resc.find(">", p + 1)
+            ntb = self.resc.find("<", p + 1)
+            if ntb != -1 and ntb < te:
+                self.opos = ntb
+                return self.resc[p:ntb], None
+        self.opos = te + 1
+        return None, self.resc[p : te + 1]
+
+    # parses tag to identify:  [tname, ttype, tattr]
+    #    tname: tag name
+    #    ttype: tag type ('begin', 'end' or 'single');
+    #    tattr: dictionary of tag atributes
+    def parsetag(self, s):
+        p = 1
+        tname = None
+        ttype = None
+        tattr = dict_()
+        while s[p : p + 1] == " ":
+            p += 1
+        if s[p : p + 1] == "/":
+            ttype = "end"
+            p += 1
+            while s[p : p + 1] == " ":
+                p += 1
+        b = p
+        while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
+            p += 1
+        tname = s[b:p].lower()
+        # some special cases
+        if tname == "?xml":
+            tname = "xml"
+        if tname == "!--":
+            ttype = "single"
+            comment = s[p:-3].strip()
+            tattr["comment"] = comment
+        if ttype is None:
+            # parse any attributes of begin or single tags
+            while s.find("=", p) != -1:
+                while s[p : p + 1] == " ":
+                    p += 1
+                b = p
+                while s[p : p + 1] != "=":
+                    p += 1
+                aname = s[b:p].lower()
+                aname = aname.rstrip(" ")
+                p += 1
+                while s[p : p + 1] == " ":
+                    p += 1
+                if s[p : p + 1] in ('"', "'"):
+                    p = p + 1
+                    b = p
+                    while s[p : p + 1] not in ('"', "'"):
+                        p += 1
+                    val = s[b:p]
+                    p += 1
+                else:
+                    b = p
+                    while s[p : p + 1] not in (">", "/", " "):
+                        p += 1
+                    val = s[b:p]
+                tattr[aname] = val
+        if ttype is None:
+            ttype = "begin"
+            if s.find("/", p) >= 0:
+                ttype = "single"
+        return ttype, tname, tattr
+
+    def taginfo_toxml(self, taginfo):
+        res = []
+        tname, tattr, tcontent = taginfo
+        res.append("<" + tname)
+        if tattr is not None:
+            for key in tattr:
+                res.append(" " + key + '="' + tattr[key] + '"')
+        if tcontent is not None:
+            res.append(">" + tcontent + "</" + tname + ">\n")
+        else:
+            res.append("/>\n")
+        return "".join(res)
+
+    def hasSpine(self):
+        return len(self.spine_order) > 0
+
+    def needEPUB3(self):
+        """Return the need3 flag, which parseData sets when it meets fields that require epub3."""
+        return self.need3
+
+    def hasRefines(self):
+        for [tname, tattr, tcontent] in self.extrameta:
+            if "refines" in tattr:
+                return True
+        return False
+
+    def createMetadata(self, epubver):
+        for taginfo in self.extrameta:
+            tname, tattr, tcontent = taginfo
+            if "refines" in tattr:
+                if epubver == "F" and "property" in tattr:
+                    attr = ' id="%s" opf:%s="%s"\n' % (
+                        tattr["refines"],
+                        tattr["property"],
+                        tcontent,
+                    )
+                    self.extra_attributes.append(attr)
+                else:
+                    tag = self.taginfo_toxml(taginfo)
+                    self.refines_metadata.append(tag)
+            else:
+                tag = self.taginfo_toxml(taginfo)
+                self.extra_metadata.append(tag)