#! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # this program works in concert with the output from KindleUnpack """ Convert from Mobi ML to XHTML """ import os import sys import re SPECIAL_HANDLING_TAGS = { "?xml": ("xmlheader", -1), "!--": ("comment", -3), "!DOCTYPE": ("doctype", -1), } SPECIAL_HANDLING_TYPES = ["xmlheader", "doctype", "comment"] SELF_CLOSING_TAGS = [ "br", "hr", "input", "img", "image", "meta", "spacer", "link", "frame", "base", "col", "reference", ] class MobiMLConverter(object): PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE) IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex") def __init__(self, filename): self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n" self.base_css_rules += "p { margin: 0em }\n" self.base_css_rules += ".bold { font-weight: bold }\n" self.base_css_rules += ".italic { font-style: italic }\n" self.base_css_rules += ( ".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n" ) self.tag_css_rules = {} self.tag_css_rule_cnt = 0 self.path = [] self.filename = filename self.wipml = open(self.filename, "rb").read() self.pos = 0 self.opfname = self.filename.rsplit(".", 1)[0] + ".opf" self.opos = 0 self.meta = "" self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css") self.current_font_size = 3 self.font_history = [] def cleanup_html(self): self.wipml = re.sub( r'
', "", self.wipml ) self.wipml = self.wipml.replace("\r\n", "\n") self.wipml = self.wipml.replace("> <", ">\n<") self.wipml = self.wipml.replace("]*>', '', self.wipml) self.wipml = self.wipml.replace("

", "
") def replace_page_breaks(self): self.wipml = self.PAGE_BREAK_PAT.sub( '
', self.wipml ) # parse leading text of ml and tag def parseml(self): p = self.pos if p >= len(self.wipml): return None if self.wipml[p] != "<": res = self.wipml.find("<", p) if res == -1: res = len(self.wipml) self.pos = res return self.wipml[p:res], None # handle comment as a special case to deal with multi-line comments if self.wipml[p : p + 4] == "", p + 1) if te != -1: te = te + 2 else: te = self.wipml.find(">", p + 1) ntb = self.wipml.find("<", p + 1) if ntb != -1 and ntb < te: self.pos = ntb return self.wipml[p:ntb], None self.pos = te + 1 return None, self.wipml[p : te + 1] # parses string version of tag to identify its name, # its type 'begin', 'end' or 'single', # plus build a hashtable of its attributes # code is written to handle the possiblity of very poor formating def parsetag(self, s): p = 1 # get the tag name tname = None ttype = None tattr = {} while s[p : p + 1] == " ": p += 1 if s[p : p + 1] == "/": ttype = "end" p += 1 while s[p : p + 1] == " ": p += 1 b = p while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"): p += 1 tname = s[b:p].lower() if tname == "!doctype": tname = "!DOCTYPE" # special cases if tname in SPECIAL_HANDLING_TAGS.keys(): ttype, backstep = SPECIAL_HANDLING_TAGS[tname] tattr["special"] = s[p:backstep] if ttype is None: # parse any attributes while s.find("=", p) != -1: while s[p : p + 1] == " ": p += 1 b = p while s[p : p + 1] != "=": p += 1 aname = s[b:p].lower() aname = aname.rstrip(" ") p += 1 while s[p : p + 1] == " ": p += 1 if s[p : p + 1] in ('"', "'"): p = p + 1 b = p while s[p : p + 1] not in ('"', "'"): p += 1 val = s[b:p] p += 1 else: b = p while s[p : p + 1] not in (">", "/", " "): p += 1 val = s[b:p] tattr[aname] = val # label beginning and single tags if ttype is None: ttype = "begin" if s.find(" /", p) >= 0: ttype = "single_ext" elif s.find("/", p) >= 0: ttype = "single" return ttype, tname, tattr # main routine to convert from mobi markup language to html def processml(self): # are these really needed html_done = False head_done = False body_done = False skip = False htmlstr = "" self.replace_page_breaks() self.cleanup_html() # now parse the cleaned up ml into standard xhtml while True: r = self.parseml() if not r: break text, tag = r if text: if not skip: htmlstr += text if tag: ttype, tname, tattr = self.parsetag(tag) # If we run into a DTD or xml declarations inside the body ... bail. if ( tname in SPECIAL_HANDLING_TAGS.keys() and tname != "comment" and body_done ): htmlstr += "\n" break # make sure self-closing tags actually self-close if ttype == "begin" and tname in SELF_CLOSING_TAGS: ttype = "single" # make sure any end tags of self-closing tags are discarded if ttype == "end" and tname in SELF_CLOSING_TAGS: continue # remove embedded guide and refernces from old mobis if tname in ("guide", "ncx", "reference") and ttype in ( "begin", "single", "single_ext", ): tname = "removeme:{0}".format(tname) tattr = None if ( tname in ("guide", "ncx", "reference", "font", "span") and ttype == "end" ): if self.path[-1] == "removeme:{0}".format(tname): tname = "removeme:{0}".format(tname) tattr = None # Get rid of font tags that only have a color attribute. if tname == "font" and ttype in ("begin", "single", "single_ext"): if "color" in tattr.keys() and len(tattr.keys()) == 1: tname = "removeme:{0}".format(tname) tattr = None # Get rid of empty spans in the markup. if ( tname == "span" and ttype in ("begin", "single", "single_ext") and not len(tattr) ): tname = "removeme:{0}".format(tname) # need to handle fonts outside of the normal methods # so fonts tags won't be added to the self.path since we keep track # of font tags separately with self.font_history if tname == "font" and ttype == "begin": # check for nested font start tags if len(self.font_history) > 0: # inject a font end tag taginfo = ("end", "font", None) htmlstr += self.processtag(taginfo) self.font_history.append((ttype, tname, tattr)) # handle the current font start tag taginfo = (ttype, tname, tattr) htmlstr += self.processtag(taginfo) continue # check for nested font tags and unnest them if tname == "font" and ttype == "end": self.font_history.pop() # handle this font end tag taginfo = ("end", "font", None) htmlstr += self.processtag(taginfo) # check if we were nested if len(self.font_history) > 0: # inject a copy of the most recent font start tag from history taginfo = self.font_history[-1] htmlstr += self.processtag(taginfo) continue # keep track of nesting path if ttype == "begin": self.path.append(tname) elif ttype == "end": if tname != self.path[-1]: print ("improper nesting: ", self.path, tname, ttype) if tname not in self.path: # handle case of end tag with no beginning by injecting empty begin tag taginfo = ("begin", tname, None) htmlstr += self.processtag(taginfo) print " - fixed by injecting empty start tag ", tname self.path.append(tname) elif len(self.path) > 1 and tname == self.path[-2]: # handle case of dangling missing end taginfo = ("end", self.path[-1], None) htmlstr += self.processtag(taginfo) print " - fixed by injecting end tag ", self.path[-1] self.path.pop() self.path.pop() if tname == "removeme:{0}".format(tname): if ttype in ("begin", "single", "single_ext"): skip = True else: skip = False else: taginfo = (ttype, tname, tattr) htmlstr += self.processtag(taginfo) # handle potential issue of multiple html, head, and body sections if tname == "html" and ttype == "begin" and not html_done: htmlstr += "\n" html_done = True if tname == "head" and ttype == "begin" and not head_done: htmlstr += "\n" # also add in metadata and style link tags htmlstr += self.meta htmlstr += ( '\n' ) head_done = True if tname == "body" and ttype == "begin" and not body_done: htmlstr += "\n" body_done = True # handle issue of possibly missing html, head, and body tags # I have not seen this but the original did something like this so ... if not body_done: htmlstr = "\n" + htmlstr + "\n" if not head_done: headstr = "\n" headstr += self.meta headstr += '\n' headstr += "\n" htmlstr = headstr + htmlstr if not html_done: htmlstr = "\n" + htmlstr + "\n" # finally add DOCTYPE info htmlstr = ( '\n\n' + htmlstr ) css = self.base_css_rules for cls, rule in self.tag_css_rules.items(): css += ".%s { %s }\n" % (cls, rule) return (htmlstr, css, self.cssname) def ensure_unit(self, raw, unit="px"): if re.search(r"\d+$", raw) is not None: raw += unit return raw # flatten possibly modified tag back to string def taginfo_tostring(self, taginfo): (ttype, tname, tattr) = taginfo if ttype is None or tname is None: return "" if ttype == "end": return "" % tname if ( ttype in SPECIAL_HANDLING_TYPES and tattr is not None and "special" in tattr.keys() ): info = tattr["special"] if ttype == "comment": return "<%s %s-->" % tname, info else: return "<%s %s>" % tname, info res = [] res.append("<%s" % tname) if tattr is not None: for key in tattr.keys(): res.append(' %s="%s"' % (key, tattr[key])) if ttype == "single": res.append("/>") elif ttype == "single_ext": res.append(" />") else: res.append(">") return "".join(res) # routines to convert from mobi ml tags atributes to xhtml attributes and styles def processtag(self, taginfo): # Converting mobi font sizes to numerics size_map = { "xx-small": "1", "x-small": "2", "small": "3", "medium": "4", "large": "5", "x-large": "6", "xx-large": "7", } size_to_em_map = { "1": ".65em", "2": ".75em", "3": "1em", "4": "1.125em", "5": "1.25em", "6": "1.5em", "7": "2em", } # current tag to work on (ttype, tname, tattr) = taginfo if not tattr: tattr = {} styles = [] if tname is None or tname.startswith("removeme"): return "" # have not seen an example of this yet so keep it here to be safe # until this is better understood if tname in ( "country-region", "place", "placetype", "placename", "state", "city", "street", "address", "content", ): tname = "div" if tname == "content" else "span" for key in tattr.keys(): tattr.pop(key) # handle general case of style, height, width, bgcolor in any tag if "style" in tattr.keys(): style = tattr.pop("style").strip() if style: styles.append(style) if "align" in tattr.keys(): align = tattr.pop("align").strip() if align: if tname in ("table", "td", "tr"): pass else: styles.append("text-align: %s" % align) if "height" in tattr.keys(): height = tattr.pop("height").strip() if ( height and "<" not in height and ">" not in height and re.search(r"\d+", height) ): if tname in ("table", "td", "tr"): pass elif tname == "img": tattr["height"] = height else: styles.append("margin-top: %s" % self.ensure_unit(height)) if "width" in tattr.keys(): width = tattr.pop("width").strip() if width and re.search(r"\d+", width): if tname in ("table", "td", "tr"): pass elif tname == "img": tattr["width"] = width else: styles.append("text-indent: %s" % self.ensure_unit(width)) if width.startswith("-"): styles.append("margin-left: %s" % self.ensure_unit(width[1:])) if "bgcolor" in tattr.keys(): # no proprietary html allowed if tname == "div": del tattr["bgcolor"] elif tname == "font": # Change font tags to span tags tname = "span" if ttype in ("begin", "single", "single_ext"): # move the face attribute to css font-family if "face" in tattr.keys(): face = tattr.pop("face").strip() styles.append('font-family: "%s"' % face) # Monitor the constantly changing font sizes, change them to ems and move # them to css. The following will work for 'flat' font tags, but nested font tags # will cause things to go wonky. Need to revert to the parent font tag's size # when a closing tag is encountered. if "size" in tattr.keys(): sz = tattr.pop("size").strip().lower() try: float(sz) except ValueError: if sz in size_map.keys(): sz = size_map[sz] else: if sz.startswith("-") or sz.startswith("+"): sz = self.current_font_size + float(sz) if sz > 7: sz = 7 elif sz < 1: sz = 1 sz = str(int(sz)) styles.append("font-size: %s" % size_to_em_map[sz]) self.current_font_size = int(sz) elif tname == "img": for attr in ("width", "height"): if attr in tattr: val = tattr[attr] if val.lower().endswith("em"): try: nval = float(val[:-2]) nval *= 16 * ( 168.451 / 72 ) # Assume this was set using the Kindle profile tattr[attr] = "%dpx" % int(nval) except: del tattr[attr] elif val.lower().endswith("%"): del tattr[attr] # convert the anchor tags if "filepos-id" in tattr: tattr["id"] = tattr.pop("filepos-id") if "name" in tattr and tattr["name"] != tattr["id"]: tattr["name"] = tattr["id"] if "filepos" in tattr: filepos = tattr.pop("filepos") try: tattr["href"] = "#filepos%d" % int(filepos) except ValueError: pass if styles: ncls = None rule = "; ".join(styles) for sel, srule in self.tag_css_rules.items(): if srule == rule: ncls = sel break if ncls is None: self.tag_css_rule_cnt += 1 ncls = "rule_%d" % self.tag_css_rule_cnt self.tag_css_rules[ncls] = rule cls = tattr.get("class", "") cls = cls + (" " if cls else "") + ncls tattr["class"] = cls # convert updated tag back to string representation if len(tattr) == 0: tattr = None taginfo = (ttype, tname, tattr) return self.taginfo_tostring(taginfo) """ main only left in for testing outside of plugin """ def main(argv=sys.argv): if len(argv) != 2: return 1 else: infile = argv[1] try: print "Converting Mobi Markup Language to XHTML" mlc = MobiMLConverter(infile) print "Processing ..." htmlstr, css, cssname = mlc.processml() outname = infile.rsplit(".", 1)[0] + "_converted.html" file(outname, "wb").write(htmlstr) file(cssname, "wb").write(css) print "Completed" print "XHTML version of book can be found at: " + outname except ValueError, e: print "Error: %s" % e return 1 return 0 if __name__ == "__main__": sys.exit(main())