kindle manager

2021-08-25 17:58:31 +08:00
parent 5f0c0a9724
commit 6b3c0f3b6b
303 changed files with 87829 additions and 42537 deletions
--- a/mobimaster/mobi/mobiml2xhtml.py
+++ b/mobimaster/mobi/mobiml2xhtml.py
@@ -0,0 +1,585 @@
+#! /usr/bin/python
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+
+# this program works in concert with the output from KindleUnpack
+
+"""
+Convert from Mobi ML to XHTML
+"""
+
+import os
+import sys
+import re
+
+SPECIAL_HANDLING_TAGS = {
+    "?xml": ("xmlheader", -1),
+    "!--": ("comment", -3),
+    "!DOCTYPE": ("doctype", -1),
+}
+
+SPECIAL_HANDLING_TYPES = ["xmlheader", "doctype", "comment"]
+
+SELF_CLOSING_TAGS = [
+    "br",
+    "hr",
+    "input",
+    "img",
+    "image",
+    "meta",
+    "spacer",
+    "link",
+    "frame",
+    "base",
+    "col",
+    "reference",
+]
+
+
+class MobiMLConverter(object):
+
+    PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE)
+    IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex")
+
+    def __init__(self, filename):
+        self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n"
+        self.base_css_rules += "p { margin: 0em }\n"
+        self.base_css_rules += ".bold { font-weight: bold }\n"
+        self.base_css_rules += ".italic { font-style: italic }\n"
+        self.base_css_rules += (
+            ".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n"
+        )
+        self.tag_css_rules = {}
+        self.tag_css_rule_cnt = 0
+        self.path = []
+        self.filename = filename
+        self.wipml = open(self.filename, "rb").read()
+        self.pos = 0
+        self.opfname = self.filename.rsplit(".", 1)[0] + ".opf"
+        self.opos = 0
+        self.meta = ""
+        self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css")
+        self.current_font_size = 3
+        self.font_history = []
+
+    def cleanup_html(self):
+        self.wipml = re.sub(
+            r'<div height="0(pt|px|ex|em|%){0,1}"></div>', "", self.wipml
+        )
+        self.wipml = self.wipml.replace("\r\n", "\n")
+        self.wipml = self.wipml.replace("> <", ">\n<")
+        self.wipml = self.wipml.replace("<mbp: ", "<mbp:")
+        # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
+        self.wipml = self.wipml.replace("<br></br>", "<br/>")
+
+    def replace_page_breaks(self):
+        self.wipml = self.PAGE_BREAK_PAT.sub(
+            '<div class="mbp_pagebreak" />', self.wipml
+        )
+
+    # parse leading text of ml and tag
+    def parseml(self):
+        p = self.pos
+        if p >= len(self.wipml):
+            return None
+        if self.wipml[p] != "<":
+            res = self.wipml.find("<", p)
+            if res == -1:
+                res = len(self.wipml)
+            self.pos = res
+            return self.wipml[p:res], None
+        # handle comment as a special case to deal with multi-line comments
+        if self.wipml[p : p + 4] == "<!--":
+            te = self.wipml.find("-->", p + 1)
+            if te != -1:
+                te = te + 2
+        else:
+            te = self.wipml.find(">", p + 1)
+            ntb = self.wipml.find("<", p + 1)
+            if ntb != -1 and ntb < te:
+                self.pos = ntb
+                return self.wipml[p:ntb], None
+        self.pos = te + 1
+        return None, self.wipml[p : te + 1]
+
+    # parses string version of tag to identify its name,
+    # its type 'begin', 'end' or 'single',
+    # plus build a hashtable of its attributes
+    # code is written to handle the possiblity of very poor formating
+    def parsetag(self, s):
+        p = 1
+        # get the tag name
+        tname = None
+        ttype = None
+        tattr = {}
+        while s[p : p + 1] == " ":
+            p += 1
+        if s[p : p + 1] == "/":
+            ttype = "end"
+            p += 1
+            while s[p : p + 1] == " ":
+                p += 1
+        b = p
+        while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
+            p += 1
+        tname = s[b:p].lower()
+        if tname == "!doctype":
+            tname = "!DOCTYPE"
+        # special cases
+        if tname in SPECIAL_HANDLING_TAGS.keys():
+            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
+            tattr["special"] = s[p:backstep]
+        if ttype is None:
+            # parse any attributes
+            while s.find("=", p) != -1:
+                while s[p : p + 1] == " ":
+                    p += 1
+                b = p
+                while s[p : p + 1] != "=":
+                    p += 1
+                aname = s[b:p].lower()
+                aname = aname.rstrip(" ")
+                p += 1
+                while s[p : p + 1] == " ":
+                    p += 1
+                if s[p : p + 1] in ('"', "'"):
+                    p = p + 1
+                    b = p
+                    while s[p : p + 1] not in ('"', "'"):
+                        p += 1
+                    val = s[b:p]
+                    p += 1
+                else:
+                    b = p
+                    while s[p : p + 1] not in (">", "/", " "):
+                        p += 1
+                    val = s[b:p]
+                tattr[aname] = val
+        # label beginning and single tags
+        if ttype is None:
+            ttype = "begin"
+            if s.find(" /", p) >= 0:
+                ttype = "single_ext"
+            elif s.find("/", p) >= 0:
+                ttype = "single"
+        return ttype, tname, tattr
+
+    # main routine to convert from mobi markup language to html
+    def processml(self):
+
+        # are these really needed
+        html_done = False
+        head_done = False
+        body_done = False
+
+        skip = False
+
+        htmlstr = ""
+        self.replace_page_breaks()
+        self.cleanup_html()
+
+        # now parse the cleaned up ml into standard xhtml
+        while True:
+
+            r = self.parseml()
+            if not r:
+                break
+
+            text, tag = r
+
+            if text:
+                if not skip:
+                    htmlstr += text
+
+            if tag:
+                ttype, tname, tattr = self.parsetag(tag)
+
+                # If we run into a DTD or xml declarations inside the body ... bail.
+                if (
+                    tname in SPECIAL_HANDLING_TAGS.keys()
+                    and tname != "comment"
+                    and body_done
+                ):
+                    htmlstr += "\n</body></html>"
+                    break
+
+                # make sure self-closing tags actually self-close
+                if ttype == "begin" and tname in SELF_CLOSING_TAGS:
+                    ttype = "single"
+
+                # make sure any end tags of self-closing tags are discarded
+                if ttype == "end" and tname in SELF_CLOSING_TAGS:
+                    continue
+
+                # remove embedded guide and refernces from old mobis
+                if tname in ("guide", "ncx", "reference") and ttype in (
+                    "begin",
+                    "single",
+                    "single_ext",
+                ):
+                    tname = "removeme:{0}".format(tname)
+                    tattr = None
+                if (
+                    tname in ("guide", "ncx", "reference", "font", "span")
+                    and ttype == "end"
+                ):
+                    if self.path[-1] == "removeme:{0}".format(tname):
+                        tname = "removeme:{0}".format(tname)
+                        tattr = None
+
+                # Get rid of font tags that only have a color attribute.
+                if tname == "font" and ttype in ("begin", "single", "single_ext"):
+                    if "color" in tattr.keys() and len(tattr.keys()) == 1:
+                        tname = "removeme:{0}".format(tname)
+                        tattr = None
+
+                # Get rid of empty spans in the markup.
+                if (
+                    tname == "span"
+                    and ttype in ("begin", "single", "single_ext")
+                    and not len(tattr)
+                ):
+                    tname = "removeme:{0}".format(tname)
+
+                # need to handle fonts outside of the normal methods
+                # so fonts tags won't be added to the self.path since we keep track
+                # of font tags separately with self.font_history
+                if tname == "font" and ttype == "begin":
+                    # check for nested font start tags
+                    if len(self.font_history) > 0:
+                        # inject a font end tag
+                        taginfo = ("end", "font", None)
+                        htmlstr += self.processtag(taginfo)
+                    self.font_history.append((ttype, tname, tattr))
+                    # handle the current font start tag
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+                    continue
+
+                # check for nested font tags and unnest them
+                if tname == "font" and ttype == "end":
+                    self.font_history.pop()
+                    # handle this font end tag
+                    taginfo = ("end", "font", None)
+                    htmlstr += self.processtag(taginfo)
+                    # check if we were nested
+                    if len(self.font_history) > 0:
+                        # inject a copy of the most recent font start tag from history
+                        taginfo = self.font_history[-1]
+                        htmlstr += self.processtag(taginfo)
+                    continue
+
+                # keep track of nesting path
+                if ttype == "begin":
+                    self.path.append(tname)
+                elif ttype == "end":
+                    if tname != self.path[-1]:
+                        print ("improper nesting: ", self.path, tname, ttype)
+                        if tname not in self.path:
+                            # handle case of end tag with no beginning by injecting empty begin tag
+                            taginfo = ("begin", tname, None)
+                            htmlstr += self.processtag(taginfo)
+                            print "     - fixed by injecting empty start tag ", tname
+                            self.path.append(tname)
+                        elif len(self.path) > 1 and tname == self.path[-2]:
+                            # handle case of dangling missing end
+                            taginfo = ("end", self.path[-1], None)
+                            htmlstr += self.processtag(taginfo)
+                            print "     - fixed by injecting end tag ", self.path[-1]
+                            self.path.pop()
+                    self.path.pop()
+
+                if tname == "removeme:{0}".format(tname):
+                    if ttype in ("begin", "single", "single_ext"):
+                        skip = True
+                    else:
+                        skip = False
+                else:
+                    taginfo = (ttype, tname, tattr)
+                    htmlstr += self.processtag(taginfo)
+
+                # handle potential issue of multiple html, head, and body sections
+                if tname == "html" and ttype == "begin" and not html_done:
+                    htmlstr += "\n"
+                    html_done = True
+
+                if tname == "head" and ttype == "begin" and not head_done:
+                    htmlstr += "\n"
+                    # also add in metadata and style link tags
+                    htmlstr += self.meta
+                    htmlstr += (
+                        '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+                    )
+                    head_done = True
+
+                if tname == "body" and ttype == "begin" and not body_done:
+                    htmlstr += "\n"
+                    body_done = True
+
+        # handle issue of possibly missing html, head, and body tags
+        # I have not seen this but the original did something like this so ...
+        if not body_done:
+            htmlstr = "<body>\n" + htmlstr + "</body>\n"
+        if not head_done:
+            headstr = "<head>\n"
+            headstr += self.meta
+            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
+            headstr += "</head>\n"
+            htmlstr = headstr + htmlstr
+        if not html_done:
+            htmlstr = "<html>\n" + htmlstr + "</html>\n"
+
+        # finally add DOCTYPE info
+        htmlstr = (
+            '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
+            + htmlstr
+        )
+
+        css = self.base_css_rules
+        for cls, rule in self.tag_css_rules.items():
+            css += ".%s { %s }\n" % (cls, rule)
+
+        return (htmlstr, css, self.cssname)
+
+    def ensure_unit(self, raw, unit="px"):
+        if re.search(r"\d+$", raw) is not None:
+            raw += unit
+        return raw
+
+    # flatten possibly modified tag back to string
+    def taginfo_tostring(self, taginfo):
+        (ttype, tname, tattr) = taginfo
+        if ttype is None or tname is None:
+            return ""
+        if ttype == "end":
+            return "</%s>" % tname
+        if (
+            ttype in SPECIAL_HANDLING_TYPES
+            and tattr is not None
+            and "special" in tattr.keys()
+        ):
+            info = tattr["special"]
+            if ttype == "comment":
+                return "<%s %s-->" % tname, info
+            else:
+                return "<%s %s>" % tname, info
+        res = []
+        res.append("<%s" % tname)
+        if tattr is not None:
+            for key in tattr.keys():
+                res.append(' %s="%s"' % (key, tattr[key]))
+        if ttype == "single":
+            res.append("/>")
+        elif ttype == "single_ext":
+            res.append(" />")
+        else:
+            res.append(">")
+        return "".join(res)
+
+    # routines to convert from mobi ml tags atributes to xhtml attributes and styles
+    def processtag(self, taginfo):
+        # Converting mobi font sizes to numerics
+        size_map = {
+            "xx-small": "1",
+            "x-small": "2",
+            "small": "3",
+            "medium": "4",
+            "large": "5",
+            "x-large": "6",
+            "xx-large": "7",
+        }
+
+        size_to_em_map = {
+            "1": ".65em",
+            "2": ".75em",
+            "3": "1em",
+            "4": "1.125em",
+            "5": "1.25em",
+            "6": "1.5em",
+            "7": "2em",
+        }
+
+        # current tag to work on
+        (ttype, tname, tattr) = taginfo
+        if not tattr:
+            tattr = {}
+
+        styles = []
+
+        if tname is None or tname.startswith("removeme"):
+            return ""
+
+        # have not seen an example of this yet so keep it here to be safe
+        # until this is better understood
+        if tname in (
+            "country-region",
+            "place",
+            "placetype",
+            "placename",
+            "state",
+            "city",
+            "street",
+            "address",
+            "content",
+        ):
+            tname = "div" if tname == "content" else "span"
+            for key in tattr.keys():
+                tattr.pop(key)
+
+        # handle general case of style, height, width, bgcolor in any tag
+        if "style" in tattr.keys():
+            style = tattr.pop("style").strip()
+            if style:
+                styles.append(style)
+
+        if "align" in tattr.keys():
+            align = tattr.pop("align").strip()
+            if align:
+                if tname in ("table", "td", "tr"):
+                    pass
+                else:
+                    styles.append("text-align: %s" % align)
+
+        if "height" in tattr.keys():
+            height = tattr.pop("height").strip()
+            if (
+                height
+                and "<" not in height
+                and ">" not in height
+                and re.search(r"\d+", height)
+            ):
+                if tname in ("table", "td", "tr"):
+                    pass
+                elif tname == "img":
+                    tattr["height"] = height
+                else:
+                    styles.append("margin-top: %s" % self.ensure_unit(height))
+
+        if "width" in tattr.keys():
+            width = tattr.pop("width").strip()
+            if width and re.search(r"\d+", width):
+                if tname in ("table", "td", "tr"):
+                    pass
+                elif tname == "img":
+                    tattr["width"] = width
+                else:
+                    styles.append("text-indent: %s" % self.ensure_unit(width))
+                    if width.startswith("-"):
+                        styles.append("margin-left: %s" % self.ensure_unit(width[1:]))
+
+        if "bgcolor" in tattr.keys():
+            # no proprietary html allowed
+            if tname == "div":
+                del tattr["bgcolor"]
+
+        elif tname == "font":
+            # Change font tags to span tags
+            tname = "span"
+            if ttype in ("begin", "single", "single_ext"):
+                # move the face attribute to css font-family
+                if "face" in tattr.keys():
+                    face = tattr.pop("face").strip()
+                    styles.append('font-family: "%s"' % face)
+
+                    # Monitor the constantly changing font sizes, change them to ems and move
+                    # them to css. The following will work for 'flat' font tags, but nested font tags
+                    # will cause things to go wonky. Need to revert to the parent font tag's size
+                    # when a closing tag is encountered.
+                if "size" in tattr.keys():
+                    sz = tattr.pop("size").strip().lower()
+                    try:
+                        float(sz)
+                    except ValueError:
+                        if sz in size_map.keys():
+                            sz = size_map[sz]
+                    else:
+                        if sz.startswith("-") or sz.startswith("+"):
+                            sz = self.current_font_size + float(sz)
+                            if sz > 7:
+                                sz = 7
+                            elif sz < 1:
+                                sz = 1
+                            sz = str(int(sz))
+                    styles.append("font-size: %s" % size_to_em_map[sz])
+                    self.current_font_size = int(sz)
+
+        elif tname == "img":
+            for attr in ("width", "height"):
+                if attr in tattr:
+                    val = tattr[attr]
+                    if val.lower().endswith("em"):
+                        try:
+                            nval = float(val[:-2])
+                            nval *= 16 * (
+                                168.451 / 72
+                            )  # Assume this was set using the Kindle profile
+                            tattr[attr] = "%dpx" % int(nval)
+                        except:
+                            del tattr[attr]
+                    elif val.lower().endswith("%"):
+                        del tattr[attr]
+
+        # convert the anchor tags
+        if "filepos-id" in tattr:
+            tattr["id"] = tattr.pop("filepos-id")
+            if "name" in tattr and tattr["name"] != tattr["id"]:
+                tattr["name"] = tattr["id"]
+
+        if "filepos" in tattr:
+            filepos = tattr.pop("filepos")
+            try:
+                tattr["href"] = "#filepos%d" % int(filepos)
+            except ValueError:
+                pass
+
+        if styles:
+            ncls = None
+            rule = "; ".join(styles)
+            for sel, srule in self.tag_css_rules.items():
+                if srule == rule:
+                    ncls = sel
+                    break
+            if ncls is None:
+                self.tag_css_rule_cnt += 1
+                ncls = "rule_%d" % self.tag_css_rule_cnt
+                self.tag_css_rules[ncls] = rule
+            cls = tattr.get("class", "")
+            cls = cls + (" " if cls else "") + ncls
+            tattr["class"] = cls
+
+        # convert updated tag back to string representation
+        if len(tattr) == 0:
+            tattr = None
+        taginfo = (ttype, tname, tattr)
+        return self.taginfo_tostring(taginfo)
+
+
+""" main only left in for testing outside of plugin """
+
+
+def main(argv=sys.argv):
+    if len(argv) != 2:
+        return 1
+    else:
+        infile = argv[1]
+
+    try:
+        print "Converting Mobi Markup Language to XHTML"
+        mlc = MobiMLConverter(infile)
+        print "Processing ..."
+        htmlstr, css, cssname = mlc.processml()
+        outname = infile.rsplit(".", 1)[0] + "_converted.html"
+        file(outname, "wb").write(htmlstr)
+        file(cssname, "wb").write(css)
+        print "Completed"
+        print "XHTML version of book can be found at: " + outname
+
+    except ValueError, e:
+        print "Error: %s" % e
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())