kindle manager
This commit is contained in:
585
mobimaster/mobi/mobiml2xhtml.py
Executable file
585
mobimaster/mobi/mobiml2xhtml.py
Executable file
@@ -0,0 +1,585 @@
|
||||
#! /usr/bin/python
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
|
||||
# this program works in concert with the output from KindleUnpack
|
||||
|
||||
"""
|
||||
Convert from Mobi ML to XHTML
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
# Tags that are not ordinary elements.  Each maps the tag name, as
# recognised by MobiMLConverter.parsetag, to a (tag type, end offset) pair;
# parsetag slices the raw tag text up to that negative offset to capture the
# tag's payload without its closing delimiter.
SPECIAL_HANDLING_TAGS = dict(
    [
        ("?xml", ("xmlheader", -1)),
        ("!--", ("comment", -3)),
        ("!DOCTYPE", ("doctype", -1)),
    ]
)

# The tag types produced for the special tags above.
SPECIAL_HANDLING_TYPES = "xmlheader doctype comment".split()

# Elements that are always written in self-closing form; the converter drops
# any explicit end tag it meets for them.
SELF_CLOSING_TAGS = (
    "br hr input img image meta spacer link frame base col reference"
).split()
|
||||
|
||||
|
||||
class MobiMLConverter(object):
    """Convert a Mobi markup file into XHTML plus a CSS stylesheet.

    The converter tokenises the markup by hand (``parseml`` / ``parsetag``),
    rewrites each tag (``processtag``) and stitches the result together in
    ``processml``.  Presentation attributes are moved into generated CSS
    classes named ``rule_N``.

    NOTE(review): the input is read in binary mode and then handled with
    ``str`` literals, which works on Python 2 only.  Running the full
    pipeline on Python 3 would need an explicit decode step; the source
    encoding is not visible here, so none is guessed.
    """

    PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE)
    IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex")

    # Mobi font size keywords -> numeric font size (as a string).
    _SIZE_MAP = {
        "xx-small": "1",
        "x-small": "2",
        "small": "3",
        "medium": "4",
        "large": "5",
        "x-large": "6",
        "xx-large": "7",
    }

    # Numeric font size -> CSS font-size value.
    _SIZE_TO_EM_MAP = {
        "1": ".65em",
        "2": ".75em",
        "3": "1em",
        "4": "1.125em",
        "5": "1.25em",
        "6": "1.5em",
        "7": "2em",
    }

    # Tag types that open (or fully represent) an element.
    _START_TYPES = ("begin", "single", "single_ext")

    def __init__(self, filename):
        """Load ``filename`` and set up the conversion state.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n"
        self.base_css_rules += "p { margin: 0em }\n"
        self.base_css_rules += ".bold { font-weight: bold }\n"
        self.base_css_rules += ".italic { font-style: italic }\n"
        self.base_css_rules += (
            ".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n"
        )
        # generated class name -> css rule text, filled in by processtag
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        # stack of currently open tag names
        self.path = []
        self.filename = filename
        # Close the handle deterministically instead of leaking it.
        with open(self.filename, "rb") as infile:
            self.wipml = infile.read()
        # read position inside self.wipml used by parseml
        self.pos = 0
        self.opfname = self.filename.rsplit(".", 1)[0] + ".opf"
        self.opos = 0
        # extra markup inserted right after the <head> start tag
        self.meta = ""
        self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css")
        self.current_font_size = 3
        # stack of (ttype, tname, tattr) for the currently open font tags
        self.font_history = []

    def cleanup_html(self):
        """Normalise line endings and a few known markup oddities in place."""
        self.wipml = re.sub(
            r'<div height="0(pt|px|ex|em|%){0,1}"></div>', "", self.wipml
        )
        self.wipml = self.wipml.replace("\r\n", "\n")
        self.wipml = self.wipml.replace("> <", ">\n<")
        self.wipml = self.wipml.replace("<mbp: ", "<mbp:")
        # self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
        self.wipml = self.wipml.replace("<br></br>", "<br/>")

    def replace_page_breaks(self):
        """Replace each run of mbp:pagebreak tags with one page-break div."""
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" />', self.wipml
        )

    # parse leading text of ml and tag
    def parseml(self):
        """Return the next token of ``self.wipml`` and advance ``self.pos``.

        Returns ``None`` at end of input, ``(text, None)`` for a run of text
        or ``(None, tag)`` for a complete tag including its delimiters.
        """
        p = self.pos
        if p >= len(self.wipml):
            return None
        if self.wipml[p] != "<":
            res = self.wipml.find("<", p)
            if res == -1:
                res = len(self.wipml)
            self.pos = res
            return self.wipml[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if self.wipml[p : p + 4] == "<!--":
            te = self.wipml.find("-->", p + 1)
            if te != -1:
                te = te + 2
        else:
            te = self.wipml.find(">", p + 1)
            ntb = self.wipml.find("<", p + 1)
            if ntb != -1 and ntb < te:
                # a new '<' starts before this one is closed: treat the
                # dangling piece as plain text
                self.pos = ntb
                return self.wipml[p:ntb], None
        if te == -1:
            # Unterminated tag or comment.  Falling through would reset
            # self.pos to 0 and re-parse the document forever, so hand the
            # remainder back as text and finish.
            self.pos = len(self.wipml)
            return self.wipml[p:], None
        self.pos = te + 1
        return None, self.wipml[p : te + 1]

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        """Split the raw tag text ``s`` into ``(ttype, tname, tattr)``.

        ``ttype`` is 'begin', 'end', 'single', 'single_ext' or one of the
        special types from SPECIAL_HANDLING_TAGS.  ``tname`` is lower-cased
        (except '!DOCTYPE').  ``tattr`` maps lower-cased attribute names to
        their raw values; for special tags it only holds the payload under
        the key 'special'.  Attributes of end tags are not parsed.
        """
        n = len(s)
        p = 1
        # get the tag name
        tname = None
        ttype = None
        tattr = {}
        while s[p : p + 1] == " ":
            p += 1
        if s[p : p + 1] == "/":
            ttype = "end"
            p += 1
            while s[p : p + 1] == " ":
                p += 1
        b = p
        # "p < n" keeps the scan finite when no terminator is present
        while p < n and s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
            p += 1
        tname = s[b:p].lower()
        if tname == "!doctype":
            tname = "!DOCTYPE"
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr["special"] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find("=", p) != -1:
                while s[p : p + 1] == " ":
                    p += 1
                b = p
                while s[p : p + 1] != "=":
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(" ")
                p += 1
                while s[p : p + 1] == " ":
                    p += 1
                if s[p : p + 1] in ('"', "'"):
                    # The value ends at the SAME quote character that opened
                    # it, so values such as "it's" survive intact.
                    quote = s[p : p + 1]
                    b = p + 1
                    end = s.find(quote, b)
                    if end == -1:
                        # unterminated value: take everything up to the
                        # tag's closing '>' (or the end of the text)
                        end = n - 1 if s[n - 1 :] == ">" else n
                        end = max(b, end)
                    val = s[b:end]
                    p = end + 1
                else:
                    b = p
                    while p < n and s[p : p + 1] not in (">", "/", " "):
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = "begin"
            if s.find(" /", p) >= 0:
                ttype = "single_ext"
            elif s.find("/", p) >= 0:
                ttype = "single"
        return ttype, tname, tattr

    # main routine to convert from mobi markup language to html
    def processml(self):
        """Convert the loaded markup and return ``(htmlstr, css, cssname)``.

        ``htmlstr`` is the XHTML document, ``css`` the stylesheet text (base
        rules plus every generated ``rule_N`` class) and ``cssname`` the path
        computed in ``__init__`` for the stylesheet.  Nesting problems that
        get repaired along the way are reported with ``print``.
        """
        html_done = False
        head_done = False
        body_done = False

        # collect output pieces and join once at the end
        out = []
        emit = out.append

        self.replace_page_breaks()
        self.cleanup_html()

        # now parse the cleaned up ml into standard xhtml
        while True:
            r = self.parseml()
            if not r:
                break
            text, tag = r

            if text:
                emit(text)
            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # An xml declaration or DTD inside the body means a second
            # document follows ... bail.  Before the body they are dropped,
            # because a fresh declaration and DOCTYPE are always prepended
            # below.  Comments are passed through (the original test
            # compared the tag NAME with "comment" and so never matched).
            if ttype in ("xmlheader", "doctype"):
                if body_done:
                    emit("\n</body></html>")
                    break
                continue

            # make sure self-closing tags actually self-close
            if ttype == "begin" and tname in SELF_CLOSING_TAGS:
                ttype = "single"

            # make sure any end tags of self-closing tags are discarded
            if ttype == "end" and tname in SELF_CLOSING_TAGS:
                continue

            # remove embedded guide and references from old mobis
            if tname in ("guide", "ncx", "reference") and ttype in self._START_TYPES:
                tname = "removeme:{0}".format(tname)
                tattr = None
            if tname in ("guide", "ncx", "reference", "font", "span") and ttype == "end":
                # the matching start tag was removed, so remove this too
                if self.path and self.path[-1] == "removeme:{0}".format(tname):
                    tname = "removeme:{0}".format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == "font" and ttype in self._START_TYPES:
                if "color" in tattr and len(tattr) == 1:
                    tname = "removeme:{0}".format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == "span" and ttype in self._START_TYPES and not len(tattr):
                tname = "removeme:{0}".format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == "font" and ttype == "begin":
                # check for nested font start tags
                if len(self.font_history) > 0:
                    # inject a font end tag
                    emit(self.processtag(("end", "font", None)))
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                emit(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == "font" and ttype == "end":
                if not self.font_history:
                    # stray </font> with nothing open: drop it rather than
                    # crash on an empty stack
                    print("improper nesting: font end tag without start tag - dropped")
                    continue
                self.font_history.pop()
                # handle this font end tag
                emit(self.processtag(("end", "font", None)))
                # check if we were nested
                if len(self.font_history) > 0:
                    # Re-open the enclosing font.  processtag already moved
                    # that tag's styling into a css class stored in the same
                    # attribute dict, so replaying it yields the same class.
                    emit(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == "begin":
                self.path.append(tname)
            elif ttype == "end":
                if not self.path or tname != self.path[-1]:
                    print(
                        "improper nesting: %s %s %s" % (self.path, tname, ttype)
                    )
                    if tname not in self.path:
                        # handle case of end tag with no beginning by injecting empty begin tag
                        emit(self.processtag(("begin", tname, None)))
                        print(" - fixed by injecting empty start tag  %s" % tname)
                        self.path.append(tname)
                    elif len(self.path) > 1 and tname == self.path[-2]:
                        # handle case of dangling missing end
                        emit(self.processtag(("end", self.path[-1], None)))
                        print(" - fixed by injecting end tag  %s" % self.path[-1])
                        self.path.pop()
                self.path.pop()

            # processtag returns "" for "removeme:" tags, so those tags
            # vanish while the text between them is kept.
            # NOTE(review): the original had a "skip text inside removed
            # tags" branch guarded by a comparison that could never be true;
            # enabling it would also swallow the text inside colour-only
            # font tags and empty spans, so the effective behaviour is kept.
            emit(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == "html" and ttype == "begin" and not html_done:
                emit("\n")
                html_done = True

            if tname == "head" and ttype == "begin" and not head_done:
                emit("\n")
                # also add in metadata and style link tags
                emit(self.meta)
                emit('<link href="styles.css" rel="stylesheet" type="text/css" />\n')
                head_done = True

            if tname == "body" and ttype == "begin" and not body_done:
                emit("\n")
                body_done = True

        htmlstr = "".join(out)

        # handle issue of possibly missing html, head, and body tags
        # I have not seen this but the original did something like this so ...
        if not body_done:
            htmlstr = "<body>\n" + htmlstr + "</body>\n"
        if not head_done:
            headstr = "<head>\n"
            headstr += self.meta
            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
            headstr += "</head>\n"
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = "<html>\n" + htmlstr + "</html>\n"

        # finally add DOCTYPE info
        htmlstr = (
            '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
            + htmlstr
        )

        css = self.base_css_rules + "".join(
            ".%s { %s }\n" % (cls, rule) for cls, rule in self.tag_css_rules.items()
        )

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit="px"):
        """Return ``raw`` with ``unit`` appended when it ends in a digit."""
        if re.search(r"\d+$", raw) is not None:
            raw += unit
        return raw

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        """Render a ``(ttype, tname, tattr)`` triple back into tag text.

        Returns "" when the type or name is ``None``.  Attribute values are
        written as-is between double quotes (no escaping is applied).
        """
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ""
        if ttype == "end":
            return "</%s>" % tname
        if (
            ttype in SPECIAL_HANDLING_TYPES
            and tattr is not None
            and "special" in tattr
        ):
            info = tattr["special"]
            # Both values must be passed as ONE tuple; "fmt % tname, info"
            # applies % to tname alone and raises TypeError.
            if ttype == "comment":
                return "<%s %s-->" % (tname, info)
            return "<%s %s>" % (tname, info)
        res = ["<%s" % tname]
        if tattr is not None:
            for key in tattr:
                res.append(' %s="%s"' % (key, tattr[key]))
        if ttype == "single":
            res.append("/>")
        elif ttype == "single_ext":
            res.append(" />")
        else:
            res.append(">")
        return "".join(res)

    # routines to convert from mobi ml tags attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        """Translate one Mobi tag into its XHTML text.

        ``taginfo`` is a ``(ttype, tname, tattr)`` triple.  Presentation
        attributes (style, align, height, width, font face/size) are turned
        into a css rule that is registered in ``self.tag_css_rules`` and
        referenced through the tag's ``class`` attribute.  Returns "" for
        tags named ``None`` or starting with "removeme".

        The attribute dict is modified in place; ``processml`` relies on
        this when it replays a stored font start tag.
        """
        # current tag to work on
        (ttype, tname, tattr) = taginfo
        if not tattr:
            tattr = {}

        styles = []

        if tname is None or tname.startswith("removeme"):
            return ""

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in (
            "country-region",
            "place",
            "placetype",
            "placename",
            "state",
            "city",
            "street",
            "address",
            "content",
        ):
            tname = "div" if tname == "content" else "span"
            # drop every attribute (clear() is safe where popping keys while
            # iterating over them is not)
            tattr.clear()

        table_tags = ("table", "td", "tr")

        # handle general case of style, height, width, bgcolor in any tag
        if "style" in tattr:
            style = tattr.pop("style").strip()
            if style:
                styles.append(style)

        if "align" in tattr:
            align = tattr.pop("align").strip()
            # for table elements the attribute is simply dropped
            if align and tname not in table_tags:
                styles.append("text-align: %s" % align)

        if "height" in tattr:
            height = tattr.pop("height").strip()
            if (
                height
                and "<" not in height
                and ">" not in height
                and re.search(r"\d+", height)
            ):
                if tname in table_tags:
                    pass
                elif tname == "img":
                    tattr["height"] = height
                else:
                    styles.append("margin-top: %s" % self.ensure_unit(height))

        if "width" in tattr:
            width = tattr.pop("width").strip()
            if width and re.search(r"\d+", width):
                if tname in table_tags:
                    pass
                elif tname == "img":
                    tattr["width"] = width
                else:
                    styles.append("text-indent: %s" % self.ensure_unit(width))
                    if width.startswith("-"):
                        styles.append("margin-left: %s" % self.ensure_unit(width[1:]))

        # no proprietary html allowed
        if "bgcolor" in tattr and tname == "div":
            del tattr["bgcolor"]

        # The font/img handling is independent of bgcolor: otherwise a font
        # start tag carrying bgcolor stays <font> while its end tag (which
        # has no attributes) becomes </span>.
        if tname == "font":
            # Change font tags to span tags
            tname = "span"
            if ttype in self._START_TYPES:
                # move the face attribute to css font-family
                if "face" in tattr:
                    face = tattr.pop("face").strip()
                    styles.append('font-family: "%s"' % face)

                # Monitor the constantly changing font sizes, change them to ems and move
                # them to css. The following will work for 'flat' font tags, but nested font tags
                # will cause things to go wonky. Need to revert to the parent font tag's size
                # when a closing tag is encountered.
                if "size" in tattr:
                    level = self._font_level(tattr.pop("size").strip().lower())
                    if level is not None:
                        styles.append(
                            "font-size: %s" % self._SIZE_TO_EM_MAP[str(level)]
                        )
                        self.current_font_size = level

        elif tname == "img":
            for attr in ("width", "height"):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith("em"):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (
                                168.451 / 72
                            )  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except (ValueError, OverflowError):
                            # not a usable number: drop the attribute
                            del tattr[attr]
                    elif val.lower().endswith("%"):
                        del tattr[attr]

        # convert the anchor tags
        if "filepos-id" in tattr:
            tattr["id"] = tattr.pop("filepos-id")
            if "name" in tattr and tattr["name"] != tattr["id"]:
                tattr["name"] = tattr["id"]

        if "filepos" in tattr:
            filepos = tattr.pop("filepos")
            try:
                tattr["href"] = "#filepos%d" % int(filepos)
            except ValueError:
                # non-numeric filepos: leave the tag without an href
                pass

        if styles:
            ncls = None
            rule = "; ".join(styles)
            # reuse the class of an identical rule if one exists
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = "rule_%d" % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get("class", "")
            cls = cls + (" " if cls else "") + ncls
            tattr["class"] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)

    def _font_level(self, raw):
        """Map a font ``size`` attribute value to a font size from 1 to 7.

        Accepts a size keyword, an absolute number, or a signed number that
        is taken relative to ``self.current_font_size``.  Numbers are
        clamped to the 1..7 range.  Returns ``None`` for anything else so
        the caller can ignore the attribute instead of failing.
        """
        if raw in self._SIZE_MAP:
            return int(self._SIZE_MAP[raw])
        try:
            value = float(raw)
        except ValueError:
            return None
        if raw.startswith("-") or raw.startswith("+"):
            value = self.current_font_size + value
        return int(min(7, max(1, value)))
|
||||
|
||||
|
||||
""" main only left in for testing outside of plugin """
|
||||
|
||||
|
||||
def main(argv=sys.argv):
    """Convert the Mobi markup file named in ``argv[1]`` to XHTML.

    Writes ``<input basename>_converted.html`` next to the input, and the
    stylesheet to the path reported by the converter.

    Returns 0 on success and 1 on wrong usage or when the conversion raises
    ``ValueError``.
    """
    if len(argv) != 2:
        # say why nothing happened instead of failing silently
        sys.stderr.write("Usage: mobiml2xhtml.py <mobi markup file>\n")
        return 1
    infile = argv[1]

    try:
        print("Converting Mobi Markup Language to XHTML")
        mlc = MobiMLConverter(infile)
        print("Processing ...")
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit(".", 1)[0] + "_converted.html"
        # "with" closes (and flushes) each output file even on error
        with open(outname, "wb") as outfile:
            outfile.write(htmlstr)
        with open(cssname, "wb") as outfile:
            outfile.write(css)
        print("Completed")
        print("XHTML version of book can be found at: " + outname)

    except ValueError as e:
        print("Error: %s" % e)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user