#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# this program works in concert with the output from KindleUnpack
"""
Convert from Mobi ML to XHTML
"""
import os
import sys
import re
# Tag names that bypass normal attribute parsing.  Each maps to a pair of
# (tag type label, end offset); parsetag slices the tag text up to that
# negative offset to capture the tag's payload.
SPECIAL_HANDLING_TAGS = dict(
    [
        ("?xml", ("xmlheader", -1)),
        ("!--", ("comment", -3)),
        ("!DOCTYPE", ("doctype", -1)),
    ]
)

# The type labels that appear as values in SPECIAL_HANDLING_TAGS.
SPECIAL_HANDLING_TYPES = list(("xmlheader", "doctype", "comment"))

# Tag names grouped as self-closing (presumably written without a separate
# end tag by the converter -- confirm against the code that consumes this).
SELF_CLOSING_TAGS = (
    "br hr input img image meta spacer link frame base col reference"
).split()
class MobiMLConverter(object):
# Matches a run of one or more directly adjacent mbp:pagebreak tags in
# opening, closing or self-closing form, ignoring case.  Used by
# replace_page_breaks.
PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE)
# Attribute names related to images.  NOTE(review): not referenced in the
# part of the class visible here; presumably the record-index attributes
# found on image tags -- confirm against the code that uses it.
IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex")
def __init__(self, filename):
self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n"
self.base_css_rules += "p { margin: 0em }\n"
self.base_css_rules += ".bold { font-weight: bold }\n"
self.base_css_rules += ".italic { font-style: italic }\n"
self.base_css_rules += (
".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n"
)
self.tag_css_rules = {}
self.tag_css_rule_cnt = 0
self.path = []
self.filename = filename
self.wipml = open(self.filename, "rb").read()
self.pos = 0
self.opfname = self.filename.rsplit(".", 1)[0] + ".opf"
self.opos = 0
self.meta = ""
self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css")
self.current_font_size = 3
self.font_history = []
def cleanup_html(self):
# Tidy self.wipml in place through a fixed sequence of textual
# substitutions; called by processml before parsing starts.
# NOTE(review): three literals in this method are visibly truncated -- the
# raw string passed to re.sub spans a line break, the third replace call
# has an unbalanced quote and looks like two statements fused together,
# and the last replace has only bare line breaks as arguments.  The markup
# they contained cannot be recovered from this file; restore the method
# from a known-good copy before relying on it.
self.wipml = re.sub(
r'
', "", self.wipml
)
# convert CRLF line endings to LF
self.wipml = self.wipml.replace("\r\n", "\n")
# put tags that are separated by a single space on separate lines
self.wipml = self.wipml.replace("> <", ">\n<")
self.wipml = self.wipml.replace("]*>', '', self.wipml)
self.wipml = self.wipml.replace("
", "
")
def replace_page_breaks(self):
self.wipml = self.PAGE_BREAK_PAT.sub(
'', self.wipml
)
# parse leading text of ml and tag
def parseml(self):
p = self.pos
if p >= len(self.wipml):
return None
if self.wipml[p] != "<":
res = self.wipml.find("<", p)
if res == -1:
res = len(self.wipml)
self.pos = res
return self.wipml[p:res], None
# handle comment as a special case to deal with multi-line comments
if self.wipml[p : p + 4] == "", p + 1)
if te != -1:
te = te + 2
else:
te = self.wipml.find(">", p + 1)
ntb = self.wipml.find("<", p + 1)
if ntb != -1 and ntb < te:
self.pos = ntb
return self.wipml[p:ntb], None
self.pos = te + 1
return None, self.wipml[p : te + 1]
# parses string version of tag to identify its name,
# its type 'begin', 'end' or 'single',
# plus build a hashtable of its attributes
# code is written to handle the possibility of very poor formatting
def parsetag(self, s):
    """Split one raw tag string into ``(ttype, tname, tattr)``.

    s -- the complete tag text, starting with its "<" character.

    Returns a tuple of:
      ttype -- "end", "begin", "single" (closed with "/"), "single_ext"
               (closed with " /"), or the type label taken from
               SPECIAL_HANDLING_TAGS for xml headers, comments and doctypes
      tname -- the tag name, lower-cased (except "!DOCTYPE")
      tattr -- dict of lower-cased attribute names to their values; for
               special tags it holds only "special", the text between the
               tag name and the tag's closing delimiter

    The scanner is deliberately tolerant of badly formed markup: every
    scan is bounded by the end of the string, quoted values end at the
    quote character that opened them, and tabs or line breaks count as
    whitespace between attributes.
    """
    blanks = (" ", "\t", "\r", "\n")
    quotes = ('"', "'")
    name_stops = (">", "/") + quotes + blanks
    value_stops = (">", "/") + blanks
    end = len(s)
    ttype = None
    tattr = {}

    # skip the leading "<" and any whitespace, then look for an end-tag "/"
    p = 1
    while s[p : p + 1] in blanks:
        p += 1
    if s[p : p + 1] == "/":
        ttype = "end"
        p += 1
        while s[p : p + 1] in blanks:
            p += 1

    # get the tag name
    b = p
    if s[p : p + 3] == "!--":
        # a comment opener is a complete name even when the comment text
        # follows it without a space
        p += 3
    else:
        while p < end and s[p] not in name_stops:
            p += 1
    tname = s[b:p].lower()
    if tname == "!doctype":
        tname = "!DOCTYPE"

    # special cases: keep the raw payload instead of parsing attributes
    if tname in SPECIAL_HANDLING_TAGS:
        ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
        tattr["special"] = s[p:backstep]

    if ttype is None:
        # parse any attributes; each pass consumes one name=value pair
        while True:
            eq = s.find("=", p)
            if eq == -1:
                break
            while s[p : p + 1] in blanks:
                p += 1
            aname = s[p:eq].lower().rstrip(" \t\r\n")
            p = eq + 1
            while s[p : p + 1] in blanks:
                p += 1
            opener = s[p : p + 1]
            if opener in quotes:
                p += 1
                close = s.find(opener, p)
                if close == -1:
                    # no matching quote: accept the other quote character
                    # so mismatched pairs still end the value
                    other = "'" if opener == '"' else '"'
                    close = s.find(other, p)
                if close == -1:
                    # unterminated value: run to the end of the tag rather
                    # than scanning past the end of the string forever
                    close = end - 1 if s.endswith(">") else end
                    val = s[p:close]
                    p = close
                else:
                    val = s[p:close]
                    p = close + 1
            else:
                b = p
                while p < end and s[p] not in value_stops:
                    p += 1
                val = s[b:p]
            tattr[aname] = val

    # label beginning and single tags
    if ttype is None:
        ttype = "begin"
        if s.find(" /", p) >= 0:
            ttype = "single_ext"
        elif s.find("/", p) >= 0:
            ttype = "single"
    return ttype, tname, tattr
# main routine to convert from mobi markup language to html
def processml(self):
# are these really needed
html_done = False
head_done = False
body_done = False
skip = False
htmlstr = ""
self.replace_page_breaks()
self.cleanup_html()
# now parse the cleaned up ml into standard xhtml
while True:
r = self.parseml()
if not r:
break
text, tag = r
if text:
if not skip:
htmlstr += text
if tag:
ttype, tname, tattr = self.parsetag(tag)
# If we run into a DTD or xml declarations inside the body ... bail.
if (
tname in SPECIAL_HANDLING_TAGS.keys()
and tname != "comment"
and body_done
):
htmlstr += "\n