kindle manager
mobiparse/mobi/__init__.py (new executable file, 7 lines)
@@ -0,0 +1,7 @@
import os

os.environ["LOGURU_AUTOINIT"] = "False"
from mobi.extract import extract
from mobi.extract import extracttest

__version__ = "0.3.1"
mobiparse/mobi/compatibility_utils.py (new executable file, 295 lines)
@@ -0,0 +1,295 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import unicode_literals, division, absolute_import, print_function

import sys
import codecs

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

iswindows = sys.platform.startswith("win")

try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

if PY2:
    from HTMLParser import HTMLParser

    _h = HTMLParser()
elif sys.version_info[1] < 4:
    import html.parser

    _h = html.parser.HTMLParser()
else:
    import html as _h

if PY3:
    text_type = str
    binary_type = bytes
    # if you will be printing arbitrary binary data to stdout on python 3
    # sys.stdin = sys.stdin.detach()
    # sys.stdout = sys.stdout.detach()
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
else:
    range = xrange
    text_type = unicode
    binary_type = str
    # if you will be printing unicode under python 2 you need to protect against
    # sys.stdout.encoding being None, which forces ascii encoding of unicode
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8

# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
# (and they amazingly claim it is by design and not a bug!)

# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>

# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>

# This mind boggling behaviour also happens when indexing a bytestring and/or
# iterating over a bytestring. In other words it will return an int but not
# the byte itself!

# The only way to access a single byte as a byte in a bytestring, and get the byte in both
# Python 2 and Python 3, is to use a slice.

# This problem is so common there are horrible hacks floating around the net to **try**
# to work around it, so that code that works on both Python 2 and Python 3 is possible.

# So in order to write code that works on both Python 2 and Python 3:
# if you index or access a single byte and want its ord() then use the bord() function;
# if instead you want it as a single character byte use the bchar() function,
# both of which are defined below.

if PY3:
    # Also note: if you decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
    # in place of ascii you will get a one-to-one mapping from byte values to
    # integer code points (in the 0 - 255 range)

    def bchr(s):
        return bytes([s])

    def bstr(s):
        if isinstance(s, str):
            return bytes(s, "latin-1")
        else:
            return bytes(s)

    def bord(s):
        return s

    def bchar(s):
        return bytes([s])


else:

    def bchr(s):
        return chr(s)

    def bstr(s):
        return str(s)

    def bord(s):
        return ord(s)

    def bchar(s):
        return s
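
# A quick illustration of the helpers above (same results on Python 2 and Python 3):
# >>> data = b"ABC"
# >>> bord(data[0])     # 65 on both
# >>> bchar(data[0])    # b'A' on both
# >>> bchr(65)          # b'A' on both
# >>> data[0:1]         # b'A' - slicing is the portable way to take a single byte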


if PY3:
    # list-producing versions of the major Python iterating functions
    def lrange(*args, **kwargs):
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        return list(filter(*args, **kwargs))


else:
    import __builtin__

    # Python 2-builtin ranges produce lists
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter

# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii


def hexlify(bdata):
    return (binascii.hexlify(bdata)).decode("ascii")
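# For example, on either platform:
# >>> hexlify(b"\x0f\xff")
# '0fff'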

# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5, Python 3 is okay with either

# If you: import re
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
# searched ... but u"" is not allowed for the pattern itself only b""
# Python 2.X allows the pattern to be any type and converts it to match the data
# and returns the same type as the data


# convert string to be utf-8 encoded
def utf8_str(p, enc="utf-8"):
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode("utf-8")
    if enc != "utf-8":
        return p.decode(enc).encode("utf-8")
    return p


# convert string to be unicode encoded
def unicode_str(p, enc="utf-8"):
    if p is None:
        return None
    if isinstance(p, text_type):
        return p
    return p.decode(enc)
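
# A minimal sketch of the two converters above:
# >>> utf8_str("caf\u00e9")          # b'caf\xc3\xa9' - text is encoded to utf-8 bytes
# >>> unicode_str(b"caf\xc3\xa9")    # 'café' - bytes are decoded using enc (utf-8 by default)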


ASCII_CHARS = set(chr(x) for x in range(128))
URL_SAFE = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789" "#" "_.-/~"
)
IRI_UNSAFE = ASCII_CHARS - URL_SAFE


# returns a quoted IRI (not a URI)
def quoteurl(href):
    if isinstance(href, binary_type):
        href = href.decode("utf-8")
    result = []
    for char in href:
        if char in IRI_UNSAFE:
            char = "%%%02x" % ord(char)
        result.append(char)
    return "".join(result)


# unquotes url/iri
def unquoteurl(href):
    if isinstance(href, binary_type):
        href = href.decode("utf-8")
    href = unquote(href)
    return href


# unescape html
def unescapeit(sval):
    return _h.unescape(sval)


# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# ie. To get sys.argv arguments and properly encode them as unicode


def unicode_argv():
    global iswindows
    global PY3
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            start = argc.value - len(sys.argv)
            return [argv[i] for i in range(start, argc.value)]
        # this should never happen
        return None
    else:
        argv = []
        argvencoding = sys.stdin.encoding
        if argvencoding is None:
            argvencoding = sys.getfilesystemencoding()
        if argvencoding is None:
            argvencoding = "utf-8"
        for arg in sys.argv:
            if isinstance(arg, text_type):
                argv.append(arg)
            else:
                argv.append(arg.decode(argvencoding))
        return argv


# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    if PY2:
        try:
            codecs.lookup("cp65001")
        except LookupError:
            codecs.register(
                lambda name: name == "cp65001" and codecs.lookup("utf-8") or None
            )
    return
mobiparse/mobi/extract.py (new executable file, 218 lines)
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
import shutil
import json
import os

from loguru import logger
import tempfile
from os.path import basename, splitext, exists, join
from mobi.kindleunpack import unpackBook
from mobi.makencx import extractNcx


def extract(infile):
    """Extract mobi file and return path to epub file"""

    tempdir = tempfile.mkdtemp(prefix="mobiex")
    if hasattr(infile, "fileno"):
        # file-like object: copy it into the temp dir first
        # (_get_candidate_names() is a private tempfile helper used to pick a random name)
        tempname = next(tempfile._get_candidate_names()) + ".mobi"
        pos = infile.tell()
        infile.seek(0)

        with open(join(tempdir, tempname), "wb") as outfile:
            shutil.copyfileobj(infile, outfile)

        infile.seek(pos)
        infile = join(tempdir, tempname)

    logger.debug("file: %s" % infile)
    fname_in = basename(infile)
    base, ext = splitext(fname_in)
    fname_out_epub = base + ".epub"
    fname_out_html = "book.html"
    fname_out_pdf = base + ".001.pdf"

    unpackBook(infile, tempdir, epubver="A")

    epub_filepath = join(tempdir, "mobi8", fname_out_epub)
    html_filepath = join(tempdir, "mobi7", fname_out_html)
    pdf_filepath = join(tempdir, fname_out_pdf)
    if exists(epub_filepath):
        return tempdir, epub_filepath
    elif exists(html_filepath):
        return tempdir, html_filepath
    elif exists(pdf_filepath):
        return tempdir, pdf_filepath
    raise ValueError("Could not extract from %s" % infile)
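
# Typical use, as a sketch ("demo.mobi" is a placeholder path; the caller owns the tempdir):
# >>> tempdir, filepath = extract("demo.mobi")
# >>> filepath                 # .../mobi8/demo.epub, .../mobi7/book.html or a .pdf
# >>> shutil.rmtree(tempdir)   # clean up when done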


def extracttest(infile):
    """Extract mobi file and return path to epub file"""

    tempdir = './t/'
    if hasattr(infile, "fileno"):
        tempname = next(tempfile._get_candidate_names()) + ".mobi"
        pos = infile.tell()
        infile.seek(0)
        with open(join(tempdir, tempname), "wb") as outfile:
            shutil.copyfileobj(infile, outfile)
        infile.seek(pos)
        infile = join(tempdir, tempname)
        # tempname 8x2vf7yv.mobi pos 0 infile ./t/8x2vf7yv.mobi
        print('tempname {} pos {} infile {}'.format(tempname, pos, infile))

    logger.debug("file: %s" % infile)
    fname_in = basename(infile)
    base, ext = splitext(fname_in)
    fname_out_epub = base + ".epub"
    fname_out_html = "book.html"
    fname_out_pdf = base + ".001.pdf"

    # infile ./t/8x2vf7yv.mobi
    unpackBook(infile, tempdir, epubver="A")

    epub_filepath = join(tempdir, "mobi8", fname_out_epub)
    html_filepath = join(tempdir, "mobi7", fname_out_html)
    pdf_filepath = join(tempdir, fname_out_pdf)

    # CGDBG
    # epub_filepath ./t/mobi8/p302tbwb.epub html_filepath ./t/mobi7/book.html pdf_filepath ./t/p302tbwb.001.pdf
    print('epub_filepath {} html_filepath {} pdf_filepath {}'.format(epub_filepath, html_filepath, pdf_filepath))

    if exists(epub_filepath):
        return tempdir, epub_filepath
    elif exists(html_filepath):
        return tempdir, html_filepath
    elif exists(pdf_filepath):
        return tempdir, pdf_filepath
    raise ValueError("Could not extract from %s" % infile)


## CG test extractNcx
def extract_ncx_test():
    # infile = "./tests/youxi.mobi"
    # infile = "./tests/xiaodao.mobi"
    filelst = [
        'youxi.mobi',
        'xiaodao.mobi',
        'laocan.azw3',
        # 'shikong.kfx',
        'shisu.azw',
        'ETF全球投资指南(一个账户,投资全球。告诉你用美股ETF投什么、怎么投,低门槛、低成本地进行全球化投资和配_JYT5RZBTKMUUJCV2XZ6RWZFNJWAS3KWP.azw3',
        '正義_ 一場思辨之旅_SVQL2PQHADT6UEVYYMJLNLP3TNVZZ2PO.azw3',
        '赘婿_IFYXK7EVMAJZU6SRDI4G3PF6KG5AA3CF.azw',
        '熊逸·佛学50讲_MW5FG7LWUA3G5ZSRJHOI6YNS4E6EC3AP.azw',
        '哲学·科学·常识_Q7IU43GSXSQ3TZULYN46U5DSTJBOSVTW.azw3',
        '理想国2020年豆瓣高分文学作品集【理想国2020年度最受好评新书】_H5LB5WWKBT55NDVJ44TGWNP3YWUXMGAN.azw',
        '大问题_ 简明哲学导论_3MGHFPCGYUPS5OX6PLI2U2ZN2D4QY6IP.azw',
        '利维坦_BQNVQ3PLMU5NABEUEYPESXWK4KEVXH2V.azw',
        #'马克斯·韦伯作品集(套装6册)【现代社会学奠基人,余英时、苏国勋推荐译本 理想国出品】_LUVR3MEUTXTZLMGRQS7RBEDQ32QB2WDZ.azw',
        '物种起源 (译林人文精选)_EFRUCMHSILFZ7IY75OF6KU3FXKG5DYQS.azw',
        '公式之美 (中国好书,人类最美的23个公式)_JWP4OH34GQTHTM4UB5KVTDDPMV5Q45XW.azw',
        '經濟學人104個大解惑:從紙鈔面額、廣告祕辛,到航空公司如何節省成本的全面揭密_RSN23YIUQOGLI5XMNP4UUWZFFBKW7SWT.azw',
        '快思慢想_6RD4VKANWTU2EIIS5ZQSW4H2WAVW6BKS.azw3',
        '老残游记_GYDUNWUDXOSL6GX376HXF6Y5BZB4BYWK.azw3',
        '美的历程_HRFLXR3SWA7YIMUHHBMOHKIFCXTKWDEN.azw',
        '世俗时代_JAJCAPFHKMBTWHQC3Z3M23MQHVM4LD5H.azw',
        '先知之後_OVMTC2SMQGT6WR5BEPB3NLIHPEXAPOZ2.azw',
        '股市稳赚_TKUOH7HY4AFL6MHGDXOA2JTJMDQMJ2M2.azw',
        '货币野史_TNDZWSPMBH3OU3GA4OONJOOS5N3RML46.azw',
        '黄金时代_UECZKDFEYBBCXD7VEGKUF3JAIZCNQTLD.azw',
        '哲学小引_ZIPCDHPBDNDXGPGFMHVMWTW2T43H4H33.azw',
        '危险的关系(译文名著精选)_LCIX4CJDXKBN44DFWQC6BNQP5T4QUJGQ.azw3',
        '生命是什么-活细胞的物理观_KHTK74YI2ZCNNLTDNO42O5TEUCFY4OAE.azw',
        '时间的边缘_FTWBORVACPNFZEHJGDPP3F5G3ZTP2API.azw3',
        '苏菲的世界_GY2VU27R6NLR5DYLSK2X3SGMDOS6OEHA.azw',
        '责任的重负_IOEVP74VFX7HI7CZLHNVDYUSBR5SKWHP.azw',
        '非常潜力股_RSO2TPJMR4Z5GRVVK27M6CZIOKXDOGFA.azw',
        '超级聊天学_为中国人量身定制的口才实操指南(会聊天,瞬间提升你的魅力指数,和任何人都能说上话,和任何人_UJSIR4LXOYUXNN3VKNM2VEJAZWPWXIA5.azw3',
        '世说新语精读 (汉语言文学原典精读系列)_EIQTMTBSOBUBAZTIO76QY3UE5DHG7QUX.azw',
        '打开:周濂的100堂西方哲学课(一部有营养、有态度,读得懂、读得动的西方哲学史)_NBLLIRUDZVFARGUMVTO3IYM7RZFWAHF5.azw',
        '中国游戏风云_45BZOYKQWCVIMPV5J5TUQB3A4WMSXG6Y.azw3',
        '顾准历史笔记_664YSYONYO3WYLR7KN4EHLAVN5AZYJPJ.azw',
        '克拉拉与太阳_AHT6ZF3TVFNTKQLLAZJHBPWVMXHXP6HC.azw',
        '现代汉英词典_B00771V9HS.azw',
        '现代汉语词典_B00AKJGTAQ.azw',
        '香帅财富报告_US63WJUWKWW2Y7GPKCIDMRVUEYTQVOKM.azw',
        '送你一颗子弹_YIFYW7SIWRQULQ5I7OHSFWW5GWJQPRXF.azw',
        '从沸腾到癫狂_泡沫背后的中国房地产真相_Z67UZLBWFW7E7RXRCMSBQK2X2TQJMV54.azw',
        '诸子百家闪耀时 (豆瓣“大神”林欣浩首部中国哲学史力作)_Z65JN5SBZKB5CXWP4QEJ5N36WBJK7LYG.azw',
        '增广贤文(精)--中华经典名著全本全注全译 (中华书局)_55WRYKJTRRUSR3F7PUHSCT7VCYLNXAO5.azw',
        '吃货的孤单心事_4TSS34PIM3LDOE7STPZLPVI5Z2NJ63ZA.azw3',
        '专业团队的管理_CZQZGKOZ2O4B5657CBJX62YTDTHGFTHI.azw3',
        '《新青年》文选_D63BJHTSQMXJOD3WZMITH5MCXPPX4TQO.azw3',
        '中国改革三步走_YAIFAT7KHQG3FKXYDBGDAODOQB3IXREP.azw3',
        '手把手教你读财报2——18节课看透银行业_N2ULAPFBPTYYROEZR6OXSFRZHWGVPP47.azw',
        '伯罗奔尼撒战争史_7IG7PYJEFQCQZKJFADJQPB63WOJZ457K.azw',
        '股市投资致富之道_DRFDHRKUZOGBOQANCZ7WCXQPO4IFXLZ3.azw',
        '中国人的性格历程_F7WWWMHTDUNPAJPQKPMRQQBUVFVWVO2J.azw',
        '股市投资致富之道_QE5QXSGVXZIU6YKQ37ILICN3Y4KMOHFB.azw',
        '手把手教你读财报_财报是用来排除企业的_BIO354K2A7W6AKDRG672GCTOW256C7W3.azw',
        '不可思议的自然对数 (探秘数学常数)_RXHNPMVRRSAJGPFS4EZMEW6YRIOAUJHL.azw',
        '关键选择【帮助起底27个赚钱的逻辑,经济下行趋势下的个人财富增长方案】_XX73ZPD6RPCZSB3XBKBTZZSC5RKQDXJO.azw',
        '中国法律与中国社会_SH3O32FVYFABLOLPVCAQEG2YFWSLTZKY.azw',
        '印度,漂浮的次大陆_YGTGYKH3CAVXF5QZIORNOU4HY3JWUFT2.azw',
        '关于那个人的备忘录_ZCIQC6KSVBPKAIOJJSASQ4T7CUSC23IO.azw',
        '尼采哲学经典(套装共5册) (李敖力荐台湾经典译本)_OQ6CLDRDV34ZPOF2ZLK6CJTMMDJMLVEW.azw',
        '刘擎西方现代思想讲义_2X4PDLIAHHBVF2JV4WS6SSWMEU6WCC2W.azw3',
        '故事是这个世界的解药_YIRJ7ONGJKHJF4VHEGAHXCRAZRUQWDC5.azw',
        '哲学家们都干了些什么_(2015年全新修订版)_7YZCO42RPEELUVMTCWMXV7VAFJHNUBH3.azw',
        '韩炳哲作品系列(套装共9册)_XQLLBYAIEHIDC6DDKPUSDUCXA5JJDV7E.azw3',
        '異常流行幻象與群眾瘋狂_54WCNGW254RB2VX2QAQOQJCTOIS5UDVZ.azw',
        '通胀来了,你准备好了吗_6EY3XVPBUCU3S2Z6GASBJ3ZFIYG7XAPV.azw3',
        '第二十二条军规:纪念版_XHGD4NJJO7IJJLI6NZQVR7GWGIPZASKU.azw',
        '反智:不願說理的人是偏執 不會說理的人是愚蠢 不敢說理的人是奴隸_FQGEEYA535SOIWBGTZMINNYUUUVOHGCO.azw',
        '中国货币史(校订版)豆瓣9.5分,好评如潮!_5QDPZ4BJCNFCQEW6OOTFT5WGGFEKJVCM.azw',
        '海洋与权力:一部新文明史_DZARAFK3BF3JM2Y6G275TNEHS3YFIY63.azw',
        '蘇東坡新傳(上下合併冊)_MQJ52QLZFG7GJIQXLGO6EOOEWDUPOTGC.azw',
        '刷新:重新发现商业与未来_SJJ2ZXTZYRIYOTA6PRLQNRBILPM6TN3O.azw',
        '禅宗是什么:胡适谈禅说佛_UV6UYZCLIBT7T5XOIZ3XEHLGYP524UUU.azw',
        '可塑的我:自我发展心理学的35堂必修课(自我发展心理学的35堂必修课)_LE5ZTKZITLXRBQVHN2GPNXODGRRZHE7C.azw3',
        '诺贝尔奖经济学合集(套装共5册)(经济学领域的集大成作品)_TD4FJVAQB35P77SCIG3VGERLQZJKG27G.azw',
        '费雪论成长股获利:投资大师80年投资致富的选股方法 (深受普通投资者欢迎与推荐的投资经典,巴菲特之师、“近_MXG3UMBD2MUQN3EEY7VC545U567VQC76.azw',
        '阿含经校注(全九册)【豆瓣9.6高分推荐!线装书局出品!一套书读通佛教“根本佛法”!阿含经专家、苏州西园_4Q3JWY3QBCLYZDBOUFU76KUYXNTXESBB.azw',
        '伤花怒放:摇滚的被缚与抗争_SGVGLBLRNURM5MCXKFKBRLUZS4HLAQOY.azw',
        '区块链:通往资产数字化之路_SHFSC7TLUVLQBYANOLNKJJTWNMHNNVZ5.azw',
        '历史的轨迹——二千年教会史_ZIMUOQ5JGFYGF4SRKFPGX5MEWMC6DPX2.azw',
        '深蓝帝国:英国海军的兴衰(全2册)【《星期日泰晤士报》年度最佳畅销书】 (甲骨文系列)_JRZSCWPDFXTNZRTBCR7HQ6BMXKLJ7QYG.azw3',
        '如何提升专注力:手把手教你用7项改变打造10倍速效率人生_S3NJU6MR5WP3ECFWFQ4NYWHYEB6P5RSZ.azw',
        '性欲和性行为:一种批判理论的99条断想(全2册)【舞蹈家金星、社会学家李银河推荐!继弗洛伊德、福柯之后的_UGCIUTVYKZYV6PDVDQPBM2JCDV7SX6VP.azw',
        '后疫情时代的全球经济与世界秩序(傅莹、蔡昉、江小娟、李扬、余永定、郑永年、迟福林、赵汀阳等20位学者合力_75MHOUFWOONZQOP5CY7E7HMI4P5SZPLT.azw3',
        '何为良好生活:行之于途而应于心_A2ETZLWN3OSZAVWDJOJBB3LTDSFLMHBC.azw',
        '万万没想到:用理工科思维理解世界_57F2POLFMUHUIDHE7KH6Z2Q732VQ5UX3.azw3',
        '徐远的投资课:投资原则与实战方法_7ESQ4POTRSIH4JOOHBL7PSWQ6JT4PLEZ.azw',
        '爆款短视频:如何频繁产出刷屏视频_EWJQBSGVTM2OXQKC5M77MKLFNWGL25CT.azw3',
        '思维的艺术:如何像哲学家一样思考_PSNIXLNDMEXX7T7K5J5NWRDIOFXY3XPX.azw',
        '牛津通识读本:佛学概论(中文版)_TUEO6SWCHMURQIGO3JKQRMZT5DWSYNYP.azw',
        '利益、制度与信息:国内政治与国际关系 (东方编译所译丛)_ESVTYWPTPGMVNMJJW7T5YP2LOQKFSHHY.azw',
        '哲学的故事【让深奥的哲学立刻生动起来!上市首年连续再版22次,迅速译成18种语言,掀起全球哲学热潮】_QCOIVA3QQN2BMYFTZQIXLGD45E2CSUNC.azw',
        '千年贸易战争史——贸易冲突与大国兴衰_ESA55ZHIJ6KVQY3OTCM3N5SB6BHDYKPC.azw3',
        '投资至简:从原点出发构建价值投资体系_GSKBIPYDNJJOAZUMNH4T3PZ2EHYYJPE4.azw3',
        '新金融秩序:如何应对不确定的金融风险(2013年诺贝尔经济学奖得主罗伯特•希勒著作)_YO7GZEODTNSUX4JJR5SRL7MOWP4PZFJ4.azw',
        '超越“街角发言者”:表达权的边缘与中心_P7SJECOJBQOYBJ5D4ZCSTPIFD2ZCMM6C.azw',
        '米开朗琪罗与教皇的天花板(甲骨文系列)_RIR7YAXJJSRRSGEPUU47ULOHDCAKQW7M.azw',
        '诗人十四个:十四位古代诗人和一位现代闯入者 一场始于1600年前的诗歌沙龙_UDDD6AF6HYA4V3SRY6KBN6M677757CCI.azw',
        '叔本华心灵咒语:请优雅地拥抱这个苦难的世界(获得独立的人格,做内心强大的自己!孤独而伟大的哲学家,道破_AZLKLSE2R3KAK5S6WXRYHPTPZLCX4RSO.azw',
        '世界观:现代人必须要懂的科学哲学和科学史(原书第3版)_WI6LHG2VGA6QITWLONOU4KOGNXKGB5EB.azw',
        '野兽绅士(单身男士必备撩妹宝典。坏男孩创始人、教父Tango终极力作!汇集超人气恋爱社区「坏男孩」中超经典_UZKJVFNZXM2C5GHYLRDJABRLKTHZYBFH.azw',
        '怪医笔记【薄世宁、李治中(菠萝)、姬十三、刘润、于莺 真诚力荐!期待值100%的医学题材小说,胸外科医生亲_FXMMP37MN4PRL2TAELQYDDMACRTECSZQ.azw',
        '不拘一格:网飞的自由与责任工作法(网飞官方图书,创始人兼CEO哈斯廷斯重磅作品。一家市值超2000亿美元,全球_3XOUEBICN4XWPCC5FESAMRAQSALMLABE.azw',
        '雅典帝国的覆亡(耶鲁大学教授为你讲述伯罗奔尼撒战争的后十年!)_HU7HIAFMRETK2JTCDVWRDNY35GSHSQGI.azw',
        '无规则游戏:阿富汗屡被中断的历史(本书获“美国北加州图书奖”提名)_SXDQBXUOP5W36IAZFWHXOALY7VS7WCKG.azw',
        '无限记忆力(如何在两周内记住更多知识,改善注意力,并培养出过目不忘的记忆力。 +附:21种实用的记忆力提升_5HCR7CSSQ32XICMV3ZQ5UY4N7AMFDT3S.azw',
        '我是谁,或什么:一部心与自我的辩证奇想集(关于“我”的终极哲学问题,嬉皮年代的思想群峰 本书可能烧掉你_C4HD4VEU4LC54IY4A7I43CK3PONLMNSJ.azw3',
        '高效论证:美国大学最实用的逻辑训练课(全美大学批判性思维经典教材,美国哲学学会重磅推荐)_FDA4AB6DURIL5RT7OZEXM76I2PUW7YCP.azw',
        '一把刀,千个字(茅盾文学奖获奖作家王安忆全新长篇;登顶《收获》长篇小说榜;人民文学出版社倾力打造)_ALQY4IH3LLNYF4KDK7LVXJGOWTYBH4PP.azw',
        '透明社会(以哲学小品文的简练和犀利,照察当今社会情状和人类心灵,洞穿数字媒体时代的群体狂欢和孤独个体_2YTTCLR4QTNFVCWKXZ7GIHEDSNQL6THE.azw',
        '不变与万变:葛剑雄说国史(中国当代历史学家、复旦大学教授葛剑雄重磅新作!全面勾勒中国历史发展的源与流_IOMBENXF5TGWX7F3BJLL322UTV5FDQTZ.azw',
        '下沉年代(白宫之乱的根本原因是什么?拜登上任后美国何去何从?通俗版《美国陷阱》,面对面访谈民主党前幕_KGL4UJC74AOO3HEMV6LRCMRZ5A6YPQGT.azw',
        '一生的读书计划(这一版有实质性的修订和扩充,最突出的变化是推荐的阅读材料的来源范围已经扩展到整个世界_SFGZNMVNLL3RE36GAERWSTDVEWRBZ4MG.azw',
        '三岛由纪夫典藏作品九部(两次入围诺贝尔奖的文学大师三岛由纪夫代表作;日本文学翻译家陈德文先生译本;人_TBNZC7F5EQ5YEKOODF6VMW2I2LBZRD4W.azw',
    ]

    for fn in filelst:
        mhdict = extractNcx(os.path.join('./tests', fn))
        print('process file {} \n {}'.format(fn,
            json.dumps(mhdict, indent=4, sort_keys=True, ensure_ascii=False)))


if __name__ == "__main__":
    # print(extracttest("../tests/demo.mobi"))
    # extract_ncx_test()
    pass
mobiparse/mobi/kindleunpack.py (new executable file, 1199 lines; diff too large to display)
mobiparse/mobi/makencx.py (new executable file, 97 lines)
@@ -0,0 +1,97 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from loguru import logger
from collections import defaultdict

from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str
from .compatibility_utils import unicode_argv, add_cp65001_codec

K8_BOUNDARY = b"BOUNDARY"
""" The section data that divides K8 mobi ebooks. """


class unpackException(Exception):
    pass


# import the kindleunpack support libraries
from .unpack_structure import fileNames
from .mobi_sectioner import Sectionizer
from .mobi_header import MobiHeader
from .mobi_ncx import ncxExtract


# input: mobi file path
# output: ncx dict
def extractNcx(infile):
    infile = unicode_str(infile)
    mhdict = defaultdict(dict)

    # process the PalmDoc database header and verify it is a mobi
    sect = Sectionizer(infile)
    if sect.ident != b"BOOKMOBI" and sect.ident != b"TEXtREAd":
        raise unpackException("Invalid file format")

    logger.debug("dumppalmheader ...")
    sect.dumppalmheader()

    # CGDBG
    print('infile {} '.format(infile))
    print('sect.dumpsectionsinfo() {}'.format(sect.dumpsectionsinfo()))
    print('sect.dumppalmheader() {}'.format(sect.dumppalmheader()))

    # scan sections to see if this is a compound mobi file (K8 format)
    # and build a list of all mobi headers to process.
    mhlst = []

    # CG mobi header
    mh = MobiHeader(sect, 0)
    metadata = mh.getMetaData()

    # if this is a mobi8-only file hasK8 here will be true
    mhlst.append(mh)
    K8Boundary = -1

    if mh.isK8():
        logger.debug("Unpacking a KF8 book...")
        hasK8 = True
    else:
        # CGDBG
        # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
        # Find out which
        hasK8 = False
        for i in range(len(sect.sectionoffsets) - 1):
            before, after = sect.sectionoffsets[i : i + 2]
            if (after - before) == 8:
                data = sect.loadSection(i)
                if data == K8_BOUNDARY:
                    sect.setsectiondescription(i, "Mobi/KF8 Boundary Section")
                    mh = MobiHeader(sect, i + 1)
                    hasK8 = True  # K8
                    mhlst.append(mh)
                    K8Boundary = i
                    break

    # hasK8: the header information includes a KF8 part
    if hasK8:
        logger.debug("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
    else:
        logger.debug("Unpacking a Mobipocket {0:d} book...".format(mh.version))

    # loop to process the ncx and write it to json with filename - bookname.ncx.json
    for tmh in mhlst:
        # CG
        # process the toc ncx
        # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
        logger.debug("Processing ncx / toc ")
        print('hasK8 {} tmh.isK8 {}'.format(hasK8, tmh.isK8()))

        ncx = ncxExtract(tmh)
        ncx_data = ncx.parseNCX()

        # check whether the mobi header information is K8 or K7
        kn = 'k8ncx' if tmh.isK8() else 'k7ncx'
        mhdict[kn] = ncx_data

    return mhdict
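
# Usage sketch ("book.azw3" is a placeholder path): the result maps 'k7ncx' and/or
# 'k8ncx' to the parsed table-of-contents entries.
# >>> mhdict = extractNcx("book.azw3")
# >>> list(mhdict.keys())   # e.g. ['k7ncx', 'k8ncx'] for a combination book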
mobiparse/mobi/mobi_cover.py (new executable file, 245 lines)
@@ -0,0 +1,245 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import unicode_str
from loguru import logger
from .unipath import pathof
import os
import imghdr

import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

USE_SVG_WRAPPER = True
""" Set to True to use svg wrapper for default. """

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

COVER_PAGE_FINENAME = "cover_page.xhtml"
""" The name for the cover page. """

DEFAULT_TITLE = "Cover"
""" The default title for the cover page. """

MAX_WIDTH = 4096
""" The max width for the svg cover page. """

MAX_HEIGHT = 4096
""" The max height for the svg cover page. """


def get_image_type(imgname, imgdata=None):
    imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))

    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
    # with only the magic JPEG bytes out there...
    # ImageMagick handles those, so, do it too.
    if imgtype is None:
        if imgdata is None:
            with open(pathof(imgname), "rb") as f:
                imgdata = f.read()
        if imgdata[0:2] == b"\xFF\xD8":
            # Get last non-null bytes
            last = len(imgdata)
            while imgdata[last - 1 : last] == b"\x00":
                last -= 1
            # Be extra safe, check the trailing bytes, too.
            if imgdata[last - 2 : last] == b"\xFF\xD9":
                imgtype = "jpeg"
    return imgtype


def get_image_size(imgname, imgdata=None):
    """Determine the image type of imgname (or imgdata) and return its size.

    Originally,
    Determine the image type of fhandle and return its size.
    from draco"""
    if imgdata is None:
        fhandle = open(pathof(imgname), "rb")
        head = fhandle.read(24)
    else:
        head = imgdata[0:24]
    if len(head) != 24:
        return

    imgtype = get_image_type(imgname, imgdata)
    if imgtype == "png":
        check = struct.unpack(b">i", head[4:8])[0]
        if check != 0x0D0A1A0A:
            return
        width, height = struct.unpack(b">ii", head[16:24])
    elif imgtype == "gif":
        width, height = struct.unpack(b"<HH", head[6:10])
    elif imgtype == "jpeg" and imgdata is None:
        try:
            fhandle.seek(0)  # Read 0xff next
            size = 2
            ftype = 0
            while not 0xC0 <= ftype <= 0xCF:
                fhandle.seek(size, 1)
                byte = fhandle.read(1)
                while ord(byte) == 0xFF:
                    byte = fhandle.read(1)
                ftype = ord(byte)
                size = struct.unpack(b">H", fhandle.read(2))[0] - 2
            # We are at a SOFn block
            fhandle.seek(1, 1)  # Skip `precision' byte.
            height, width = struct.unpack(b">HH", fhandle.read(4))
        except Exception:  # IGNORE:W0703
            return
    elif imgtype == "jpeg" and imgdata is not None:
        try:
            pos = 0
            size = 2
            ftype = 0
            while not 0xC0 <= ftype <= 0xCF:
                pos += size
                byte = imgdata[pos : pos + 1]
                pos += 1
                while ord(byte) == 0xFF:
                    byte = imgdata[pos : pos + 1]
                    pos += 1
                ftype = ord(byte)
                size = struct.unpack(b">H", imgdata[pos : pos + 2])[0] - 2
                pos += 2
            # We are at a SOFn block
            pos += 1  # Skip `precision' byte.
            height, width = struct.unpack(b">HH", imgdata[pos : pos + 4])
            pos += 4
        except Exception:  # IGNORE:W0703
            return
    else:
        return
    return width, height
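
# For example ("cover.jpg" is a placeholder; None is returned when the type or
# header cannot be determined):
# >>> get_image_size("cover.jpg")          # (600, 800)
# >>> get_image_size(None, imgdata=blob)   # same, probing an in-memory byte string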


# XXX experimental
class CoverProcessor(object):

    """Create a cover page."""

    def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        self.cover_page = COVER_PAGE_FINENAME
        self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
        self.lang = metadata.get("Language", ["en"])[0]
        # This should ensure that if the methods to find the cover image's
        # dimensions should fail for any reason, the SVG routine will not be used.
        [self.width, self.height] = (-1, -1)
        if FORCE_DEFAULT_TITLE:
            self.title = DEFAULT_TITLE
        else:
            self.title = metadata.get("Title", [DEFAULT_TITLE])[0]

        self.cover_image = None
        if imgname is not None:
            self.cover_image = imgname
        elif "CoverOffset" in metadata:
            imageNumber = int(metadata["CoverOffset"][0])
            cover_image = self.rscnames[imageNumber]
            if cover_image is not None:
                self.cover_image = cover_image
            else:
                logger.debug("Warning: Cannot identify the cover image.")
        if self.use_svg:
            try:
                if imgdata is None:
                    fname = os.path.join(files.imgdir, self.cover_image)
                    [self.width, self.height] = get_image_size(fname)
                else:
                    [self.width, self.height] = get_image_size(None, imgdata)
            except:
                self.use_svg = False
            width = self.width
            height = self.height
            if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
                self.use_svg = False
        return

    def getImageName(self):
        return self.cover_image

    def getXHTMLName(self):
        return self.cover_page

    def buildXHTML(self):
        logger.debug("Building a cover page.")
        files = self.files
        cover_image = self.cover_image
        title = self.title
        lang = self.lang

        image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
        image_path = os.path.join(image_dir, cover_image).replace("\\", "/")

        if not self.use_svg:
            data = ""
            data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
            data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
            data += ' xml:lang="{:s}">\n'.format(lang)
            data += "<head>\n<title>{:s}</title>\n".format(title)
            data += '<style type="text/css">\n'
            data += "body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n"
            data += "div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n"
            data += "img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n"
            data += "</style>\n</head>\n"
            data += "<body><div>\n"
            data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
            data += "</div></body>\n</html>"
        else:
            width = self.width
            height = self.height
            viewBox = "0 0 {0:d} {1:d}".format(width, height)

            data = ""
            data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
            data += '<html xmlns="http://www.w3.org/1999/xhtml"'
            data += ' xml:lang="{:s}">\n'.format(lang)
            data += "<head>\n <title>{:s}</title>\n".format(title)
            data += '<style type="text/css">\n'
            data += "svg {padding: 0pt; margin:0pt}\n"
            data += "body { text-align: center; padding:0pt; margin: 0pt; }\n"
            data += "</style>\n</head>\n"
            data += "<body>\n <div>\n"
            data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
            data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(
                viewBox
            )
            data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(
                height, width, image_path
            )
            data += " </svg>\n"
            data += " </div>\n</body>\n</html>"
        return data

    def writeXHTML(self):
        files = self.files
        cover_page = self.cover_page

        data = self.buildXHTML()

        outfile = os.path.join(files.k8text, cover_page)
        if os.path.exists(pathof(outfile)):
            logger.debug("Warning: {:s} already exists.".format(cover_page))
            os.remove(pathof(outfile))
        with open(pathof(outfile), "wb") as f:
            f.write(data.encode("utf-8"))
        return

    def guide_toxml(self):
        files = self.files
        text_dir = os.path.relpath(files.k8text, files.k8oebps)
        data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
            text_dir, self.cover_page
        )
        return data
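
# Sketch of how the class above is typically driven (files is a fileNames object,
# metadata and rscnames come from the unpacked book; the names are placeholders):
# >>> cover = CoverProcessor(files, metadata, rscnames)
# >>> cover.writeXHTML()                   # writes cover_page.xhtml under files.k8text
# >>> guide_entry = cover.guide_toxml()    # <reference type="cover" .../> for the OPF guide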
mobiparse/mobi/mobi_dict.py (new executable file, 473 lines)
@@ -0,0 +1,473 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
from loguru import logger

if PY2:
    range = xrange
    array_format = b"B"
if PY3:
    unichr = chr
    array_format = "B"

import array

import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

DEBUG_DICT = True


class InflectionData(object):
    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for idata in self.infldatas:
            (start,) = struct.unpack_from(b">L", idata, 0x14)
            (count,) = struct.unpack_from(b">L", idata, 0x18)
            self.starts.append(start)
            self.counts.append(count)

    def lookup(self, lookupvalue):
        i = 0
        rvalue = lookupvalue
        while rvalue >= self.counts[i]:
            rvalue = rvalue - self.counts[i]
            i += 1
            if i == len(self.counts):
                logger.debug("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return rvalue, self.starts[i], self.counts[i], self.infldatas[i]

    def offsets(self, value):
        rvalue, start, count, data = self.lookup(value)
        (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
        if rvalue + 1 < count:
            (nextOffset,) = struct.unpack_from(
                b">H", data, start + 4 + (2 * (rvalue + 1))
            )
        else:
            nextOffset = None
        return offset, nextOffset, data


class dictSupport(object):
    def __init__(self, mh, sect):
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        "read INDX header"
        if not data[:4] == b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
        header["otype"] = otype
        header["oentries"] = oentries

        if DEBUG_DICT:
            logger.debug(
                "otype %d, oentries %d, op1 %d, op2 %d, otagx %d"
                % (otype, oentries, op1, op2, otagx)
            )

        if header["code"] == 0xFDEA or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if DEBUG_DICT:
            logger.debug("parsed INDX header:")
            for key in header:
                logger.debug("%s %x" % (key, header[key]))
            logger.debug("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            logger.debug(
                "Info: Document contains orthographic index, handle as dictionary"
            )
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                logger.debug("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr["count"]
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr["len"]
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData
                )
                if DEBUG_DICT:
                    logger.debug("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    logger.debug(
                        "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    )
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            logger.debug("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr["len"]
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr["count"]
            logger.debug("orthIndexCount is %d" % orthIndexCount)
            if DEBUG_DICT:
                logger.debug("orthTagTable: %s" % tagTable)
            if hordt2 is not None:
                logger.debug(
                    "orth entry uses ordt2 lookup table of type %d" % idxhdr["otype"]
                )
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                logger.debug("Info: Index doesn't contain entry length tags")

            logger.debug("Read dictionary index data")
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo["start"]
                entryCount = hdrinfo["count"]
                idxPositions = []
                for j in range(entryCount):
                    (pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j + 1]
                    textLength = ord(data[startPos : startPos + 1])
                    text = data[startPos + 1 : startPos + 1 + textLength]
                    if hordt2 is not None:
                        utext = ""
                        if idxhdr["otype"] == 0:
                            pattern = b">H"
                            inc = 2
                        else:
                            pattern = b">B"
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            (off,) = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode("utf-8")

                    tagMap = getTagMap(
                        controlByteCount,
                        tagTable,
                        data,
                        startPos + 1 + textLength,
                        endPos,
                    )
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2A in tagMap:
                            inflectionGroups = self.getInflectionGroups(
                                text,
                                inflectionControlByteCount,
                                inflectionTagTable,
                                dinfl,
                                inflNameData,
                                tagMap[0x2A],
                            )
                        else:
                            inflectionGroups = b""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = (
                                b'<idx:entry scriptable="yes"><idx:orth value="'
                                + text
                                + b'">'
                                + inflectionGroups
                                + b"</idx:orth>"
                            )
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = (
                                    positionMap[entryStartPosition] + ml
                                )
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = (
                                    b"</idx:entry>" + positionMap[entryEndPosition]
                                )
                            else:
                                positionMap[entryEndPosition] = b"</idx:entry>"

                        else:
                            indexTags = (
                                b'<idx:entry>\n<idx:orth value="'
                                + text
                                + b'">\n'
                                + inflectionGroups
                                + b"</idx:entry>\n"
                            )
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = (
                                    positionMap[entryStartPosition] + indexTags
                                )
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        """
        Test if tag table contains given tag.

        @param tagTable: The tag table.
        @param tag: The tag to search.
        @return: True if tag table contains given tag; False otherwise.
        """
        for currentTag, _, _, _ in tagTable:
            if currentTag == tag:
                return True
        return False

    def getInflectionGroups(
        self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList
    ):
        """
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param dinfl: The InflectionData object used to select the right inflection data section.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        """
        result = b""
        for value in groupList:
            offset, nextOffset, data = dinfl.offsets(value)

            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset : offset + 1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.
            if 0x05 not in tagMap:
                logger.debug("Error: Required tag 0x05 not found in tagMap")
                return b""
            if 0x1A not in tagMap:
                logger.debug("Error: Required tag 0x1a not found in tagMap")
                return b""

            result += b"<idx:infl>"

            for i in range(len(tagMap[0x05])):

                # Get name of inflection rule.
                value = tagMap[0x05][i]
                consumed, textLength = getVariableWidthValue(inflectionNames, value)
                inflectionName = inflectionNames[
                    value + consumed : value + consumed + textLength
                ]

                # Get and apply inflection rule across possibly multiple inflection data sections
                value = tagMap[0x1A][i]
                rvalue, start, count, data = dinfl.lookup(value)
                (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
                textLength = ord(data[offset : offset + 1])
                inflection = self.applyInflectionRule(
                    mainEntry, data, offset + 1, offset + 1 + textLength
                )
                if inflection is not None:
                    result += (
                        b' <idx:iform name="'
                        + inflectionName
                        + b'" value="'
                        + inflection
                        + b'"/>'
                    )

            result += b"</idx:infl>"
        return result

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        """
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        """
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset : charOffset + 1]
            abyte = ord(char)
            if abyte >= 0x0A and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0A
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                elif position == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x03: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x04: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    else:
                        logger.debug(
                            "Error: Inflection rule mode %x is not implemented" % mode
                        )
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            else:
                logger.debug(
                    "Error: Inflection rule mode %x is not implemented" % abyte
                )
                return None
        # note: tostring() is the Python 2 spelling of tobytes()
        return utf8_str(byteArray.tostring())
mobiparse/mobi/mobi_header.py (new executable file, 1032 lines; diff too large to display)
mobiparse/mobi/mobi_html.py (new executable file, 516 lines)
@@ -0,0 +1,516 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, utf8_str
from loguru import logger

if PY2:
    range = xrange

import re

# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself only b""

from .mobi_utils import fromBase32


class HTMLProcessor:
    def __init__(self, files, metadata, rscnames):
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        # for original style mobis, default to including all image files in the opf manifest
        self.used = {}
        for name in rscnames:
            self.used[name] = "used"

    def findAnchors(self, rawtext, indx_data, positionMap):
        # process the raw text
        # find anchors...
        logger.debug("Find link anchors")
        link_pattern = re.compile(
            br"""<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>""", re.IGNORECASE
        )
        # TEST NCX: merge in filepos from indx
        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
        if indx_data:
            pos_indx = [e["pos"] for e in indx_data if e["pos"] > 0]
            pos_links = list(set(pos_links + pos_indx))

        for position in pos_links:
            if position in positionMap:
                positionMap[position] = positionMap[position] + utf8_str(
                    '<a id="filepos%d" />' % position
                )
            else:
                positionMap[position] = utf8_str('<a id="filepos%d" />' % position)

        # apply dictionary metadata and anchors
        logger.debug("Insert data into html")
        pos = 0
        lastPos = len(rawtext)
        dataList = []
        for end in sorted(positionMap.keys()):
            if end == 0 or end > lastPos:
                continue  # something's up - can't put a tag in outside <html>...</html>
            dataList.append(rawtext[pos:end])
            dataList.append(positionMap[end])
            pos = end
        dataList.append(rawtext[pos:])
        srctext = b"".join(dataList)
        rawtext = None
        dataList = None
        self.srctext = srctext
        self.indx_data = indx_data
        return srctext

    def insertHREFS(self):
        srctext = self.srctext
        rscnames = self.rscnames
        metadata = self.metadata

        # put in the hrefs
        logger.debug("Insert hrefs into html")
        # There doesn't seem to be a standard, so search as best as we can

        link_pattern = re.compile(
            br"""<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>""", re.IGNORECASE
        )
        srctext = link_pattern.sub(br"""<a\1href="#filepos\2"\3>""", srctext)

        # remove empty anchors
        logger.debug("Remove empty anchors from html")
        srctext = re.sub(br"<a\s*/>", br"", srctext)
        srctext = re.sub(br"<a\s*>\s*</a>", br"", srctext)

        # convert image references
        logger.debug("Insert image references into html")
        # split string into image tag pieces and other pieces
        image_pattern = re.compile(br"""(<img.*?>)""", re.IGNORECASE)
        image_index_pattern = re.compile(
            br"""recindex=['"]{0,1}([0-9]+)['"]{0,1}""", re.IGNORECASE
        )
        srcpieces = image_pattern.split(srctext)
        srctext = self.srctext = None

        # all odd pieces are image tags (null strings on even pieces if no space between them in srctext)
        for i in range(1, len(srcpieces), 2):
            tag = srcpieces[i]
            for m in image_index_pattern.finditer(tag):
                imageNumber = int(m.group(1))
                imageName = rscnames[imageNumber - 1]
                if imageName is None:
                    logger.debug(
                        "Error: Referenced image %s was not recognized as a valid image"
                        % imageNumber
                    )
                else:
                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
                    tag = image_index_pattern.sub(replacement, tag, 1)
            srcpieces[i] = tag
        srctext = b"".join(srcpieces)

        # add in character set meta into the html header if needed
        if "Codec" in metadata:
            srctext = (
                srctext[0:12]
                + b'<meta http-equiv="content-type" content="text/html; charset='
                + utf8_str(metadata.get("Codec")[0])
                + b'" />'
                + srctext[12:]
            )
        return srctext, self.used


class XHTMLK8Processor:
    def __init__(self, rscnames, k8proc):
        self.rscnames = rscnames
        self.k8proc = k8proc
        self.used = {}

    def buildXHTML(self):

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
        # XXXX is the offset in records into divtbl
        # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
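        # for example, kindle:pos:fid:000A:off:000000000V resolves to divtbl record
        # fromBase32(b"000A") == 10, with fromBase32(b"000000000V") == 31 added to that
        # record's insert position (illustrative values only)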
|
||||
|
||||
# pos:fid pattern
|
||||
posfid_pattern = re.compile(br"""(<a.*?href=.*?>)""", re.IGNORECASE)
|
||||
posfid_index_pattern = re.compile(
|
||||
br"""['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']"""
|
||||
)
|
||||
|
||||
parts = []
|
||||
logger.debug("Building proper xhtml for each file")
|
||||
for i in range(self.k8proc.getNumberOfParts()):
|
||||
part = self.k8proc.getPart(i)
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
|
||||
|
||||
# internal links
|
||||
srcpieces = posfid_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in posfid_index_pattern.finditer(tag):
|
||||
posfid = m.group(1)
|
||||
offset = m.group(2)
|
||||
filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
|
||||
if idtag == b"":
|
||||
replacement = b'"' + utf8_str(filename) + b'"'
|
||||
else:
|
||||
replacement = (
|
||||
b'"' + utf8_str(filename) + b"#" + idtag + b'"'
|
||||
)
|
||||
tag = posfid_index_pattern.sub(replacement, tag, 1)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
parts.append(part)
|
||||
|
||||
# we are free to cut and paste as we see fit
|
||||
# we can safely remove all of the Kindlegen generated aid tags
|
||||
# change aid ids that are in k8proc.linked_aids to xhtml ids
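# e.g. (hypothetical aid value) <p aid="0A3"> becomes <p id="aid-0A3"> when
# b"0A3" is in linked_aids, otherwise the aid attribute is dropped leaving <p>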
|
||||
find_tag_with_aid_pattern = re.compile(
|
||||
br"""(<[^>]*\said\s*=[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
within_tag_aid_position_pattern = re.compile(br"""\said\s*=['"]([^'"]*)['"]""")
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
srcpieces = find_tag_with_aid_pattern.split(part)
|
||||
for j in range(len(srcpieces)):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||
try:
|
||||
aid = m.group(1)
|
||||
except IndexError:
|
||||
aid = None
|
||||
replacement = b""
|
||||
if aid in self.k8proc.linked_aids:
|
||||
replacement = b' id="aid-' + aid + b'"'
|
||||
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
parts[i] = part
|
||||
|
||||
# we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
|
||||
# with page-break-after style patterns
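# e.g. <div data-AmznPageBreak="always"> becomes
# <div style="page-break-after:always">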
|
||||
find_tag_with_AmznPageBreak_pattern = re.compile(
|
||||
br"""(<[^>]*\sdata-AmznPageBreak=[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
within_tag_AmznPageBreak_position_pattern = re.compile(
|
||||
br"""\sdata-AmznPageBreak=['"]([^'"]*)['"]"""
|
||||
)
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
|
||||
for j in range(len(srcpieces)):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
|
||||
lambda m: b' style="page-break-after:' + m.group(1) + b'"', tag
|
||||
)
|
||||
part = b"".join(srcpieces)
|
||||
parts[i] = part
|
||||
|
||||
# we have to handle substitutions for the flows pieces first as they may
|
||||
# be inlined into the xhtml text
|
||||
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||
# kindle:embed:XXXX (used for fonts)
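# illustrative (hypothetical resource name): src="kindle:embed:0002?mime=image/jpeg"
# resolves via fromBase32("0002") == 2 to rscnames[1] and is rewritten to
# src="../Images/<that resource name>"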
|
||||
|
||||
flows = []
|
||||
flows.append(None)
|
||||
flowinfo = []
|
||||
flowinfo.append([None, None, None, None])
|
||||
|
||||
# regular expression search patterns
|
||||
img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
|
||||
img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
flow_pattern = re.compile(
|
||||
br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
url_pattern = re.compile(br"""(url\(.*?\))""", re.IGNORECASE)
|
||||
url_img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]""",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
font_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)["')]""", re.IGNORECASE
|
||||
)
|
||||
url_css_index_pattern = re.compile(
|
||||
br"""kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*""", re.IGNORECASE
|
||||
)
|
||||
url_svg_image_pattern = re.compile(
|
||||
br"""kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(1, self.k8proc.getNumberOfFlows()):
|
||||
[ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
|
||||
flowpart = self.k8proc.getFlow(i)
|
||||
|
||||
# links to raster image files from image tags
|
||||
# image_pattern
|
||||
srcpieces = img_pattern.split(flowpart)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<im"):
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
if imageName is not None:
|
||||
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
flowpart = b"".join(srcpieces)
|
||||
|
||||
# replacements inside css url():
|
||||
srcpieces = url_pattern.split(flowpart)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
|
||||
# process links to raster image files
|
||||
for m in url_img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if imageName is not None:
|
||||
replacement = osep + b"../Images/" + utf8_str(imageName) + csep
|
||||
self.used[imageName] = "used"
|
||||
tag = url_img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
|
||||
# process links to fonts
|
||||
for m in font_index_pattern.finditer(tag):
|
||||
fontNumber = fromBase32(m.group(1))
|
||||
fontName = self.rscnames[fontNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if fontName is None:
|
||||
logger.debug(
|
||||
"Error: Referenced font %s was not recognized as a valid font in %s"
|
||||
% (fontNumber, tag)
|
||||
)
|
||||
else:
|
||||
replacement = osep + b"../Fonts/" + utf8_str(fontName) + csep
|
||||
tag = font_index_pattern.sub(replacement, tag, 1)
|
||||
self.used[fontName] = "used"
|
||||
|
||||
# process links to other css pieces
|
||||
for m in url_css_index_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
|
||||
tag = url_css_index_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
|
||||
# process links to svg images
|
||||
for m in url_svg_image_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
|
||||
tag = url_svg_image_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
|
||||
srcpieces[j] = tag
|
||||
flowpart = b"".join(srcpieces)
|
||||
|
||||
# store away in our own copy
|
||||
flows.append(flowpart)
|
||||
|
||||
# I do not think this case exists and even if it does exist, it needs to be done in a separate
|
||||
# pass to prevent inlining a flow piece into another flow piece before the inserted one or the
|
||||
# target one has been fully processed
|
||||
|
||||
# but keep it around in case we end up needing it
|
||||
|
||||
# flow pattern not inside url()
|
||||
# srcpieces = tag_pattern.split(flowpart)
|
||||
# for j in range(1, len(srcpieces),2):
|
||||
# tag = srcpieces[j]
|
||||
# if tag.startswith(b'<'):
|
||||
# for m in flow_pattern.finditer(tag):
|
||||
# num = fromBase32(m.group(1))
|
||||
# [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
# flowtext = self.k8proc.getFlow(num)
|
||||
# if fmt == b'inline':
|
||||
# tag = flowtext
|
||||
# else:
|
||||
# replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||
# tag = flow_pattern.sub(replacement, tag, 1)
|
||||
# self.used[fnm] = 'used'
|
||||
# srcpieces[j] = tag
|
||||
# flowpart = b"".join(srcpieces)
|
||||
|
||||
# now handle the main text xhtml parts
|
||||
|
||||
# Handle the flow items in the XHTML text pieces
|
||||
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
flow_pattern = re.compile(
|
||||
br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
|
||||
)
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
# flow pattern
|
||||
srcpieces = tag_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in flow_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
if num > 0 and num < len(self.k8proc.flowinfo):
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
flowpart = flows[num]
|
||||
if fmt == b"inline":
|
||||
tag = flowpart
|
||||
else:
|
||||
replacement = (
|
||||
b'"../'
|
||||
+ utf8_str(pdir)
|
||||
+ b"/"
|
||||
+ utf8_str(fnm)
|
||||
+ b'"'
|
||||
)
|
||||
tag = flow_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
else:
|
||||
print(
|
||||
"warning: ignoring non-existent flow link",
|
||||
tag,
|
||||
" value 0x%x" % num,
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# Handle any embedded raster images links in style= attributes urls
|
||||
style_pattern = re.compile(
|
||||
br"""(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# replace urls in style attributes
|
||||
srcpieces = style_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if b"kindle:embed" in tag:
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if imageName is not None:
|
||||
replacement = (
|
||||
osep + b"../Images/" + utf8_str(imageName) + csep
|
||||
)
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s in style url was not recognized in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# Handle any embedded raster images links in the xhtml text
|
||||
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||
img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
|
||||
img_index_pattern = re.compile(br"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""")
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# links to raster image files
|
||||
# image_pattern
|
||||
srcpieces = img_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<im"):
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
if imageName is not None:
|
||||
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# finally perform any general cleanups needed to make valid XHTML
|
||||
# these include:
|
||||
# in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio"
|
||||
# in svg tags replace "viewbox" attributes with "viewBox"
|
||||
# in <li> remove value="XX" attributes since these are illegal
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
li_value_pattern = re.compile(
|
||||
br"""\svalue\s*=\s*['"][^'"]*['"]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# tag pattern
|
||||
srcpieces = tag_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<svg") or tag.startswith(b"<SVG"):
|
||||
tag = tag.replace(b"preserveaspectratio", b"preserveAspectRatio")
|
||||
tag = tag.replace(b"viewbox", b"viewBox")
|
||||
elif tag.startswith(b"<li ") or tag.startswith(b"<LI "):
|
||||
tagpieces = li_value_pattern.split(tag)
|
||||
tag = b"".join(tagpieces)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
self.k8proc.setFlows(flows)
|
||||
self.k8proc.setParts(parts)
|
||||
|
||||
return self.used
|
||||
327
mobiparse/mobi/mobi_index.py
Executable file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bchr, bstr, bord
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
from .mobi_utils import toHex
|
||||
|
||||
|
||||
class MobiIndex:
|
||||
# CGDBG
|
||||
def __init__(self, sect, DEBUG=True):
|
||||
self.sect = sect
|
||||
self.DEBUG = DEBUG
|
||||
|
||||
def getIndexData(self, idx, label="Unknown"):
|
||||
sect = self.sect
|
||||
outtbl = []
|
||||
ctoc_text = {}
|
||||
if idx != 0xFFFFFFFF:
|
||||
sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
|
||||
data = sect.loadSection(idx)
|
||||
idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
|
||||
IndexCount = idxhdr["count"]
|
||||
# handle the case of multiple sections used for CTOC
|
||||
rec_off = 0
|
||||
off = idx + IndexCount + 1
|
||||
for j in range(idxhdr["nctoc"]):
|
||||
cdata = sect.loadSection(off + j)
|
||||
sect.setsectiondescription(off + j, label + " CTOC Data " + str(j))
|
||||
ctocdict = self.readCTOC(cdata)
|
||||
for k in ctocdict:
|
||||
ctoc_text[k + rec_off] = ctocdict[k]
|
||||
rec_off += 0x10000
|
||||
tagSectionStart = idxhdr["len"]
|
||||
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||
if self.DEBUG:
|
||||
logger.debug("ControlByteCount is", controlByteCount)
|
||||
logger.debug("IndexCount is", IndexCount)
|
||||
logger.debug("TagTable: %s" % tagTable)
|
||||
for i in range(idx + 1, idx + 1 + IndexCount):
|
||||
sect.setsectiondescription(
|
||||
i, "{0} Extra {1:d} INDX section".format(label, i - idx)
|
||||
)
|
||||
data = sect.loadSection(i)
|
||||
hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
|
||||
idxtPos = hdrinfo["start"]
|
||||
entryCount = hdrinfo["count"]
|
||||
if self.DEBUG:
|
||||
logger.debug("%s %s" % (idxtPos, entryCount))
|
||||
# loop through to build up the IDXT position starts
|
||||
idxPositions = []
|
||||
for j in range(entryCount):
|
||||
(pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
|
||||
idxPositions.append(pos)
|
||||
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||
idxPositions.append(idxtPos)
|
||||
# for each entry in the IDXT build up the tagMap and any associated text
|
||||
for j in range(entryCount):
|
||||
startPos = idxPositions[j]
|
||||
endPos = idxPositions[j + 1]
|
||||
textLength = ord(data[startPos : startPos + 1])
|
||||
text = data[startPos + 1 : startPos + 1 + textLength]
|
||||
if hordt2 is not None:
|
||||
text = b"".join(bchr(hordt2[bord(x)]) for x in text)
|
||||
tagMap = getTagMap(
|
||||
controlByteCount,
|
||||
tagTable,
|
||||
data,
|
||||
startPos + 1 + textLength,
|
||||
endPos,
|
||||
)
|
||||
outtbl.append([text, tagMap])
|
||||
if self.DEBUG:
|
||||
# CGDBG
|
||||
logger.debug('tagMap {}'.format(tagMap))
|
||||
logger.debug('text {}'.format(text))
|
||||
logger.debug('data {}'.format(data))
|
||||
|
||||
return outtbl, ctoc_text
|
||||
|
||||
def parseINDXHeader(self, data):
|
||||
"read INDX header"
|
||||
if not data[:4] == b"INDX":
|
||||
logger.debug("Warning: index section is not INDX")
|
||||
return False
|
||||
words = (
|
||||
"len",
|
||||
"nul1",
|
||||
"type",
|
||||
"gen",
|
||||
"start",
|
||||
"count",
|
||||
"code",
|
||||
"lng",
|
||||
"total",
|
||||
"ordt",
|
||||
"ligt",
|
||||
"nligt",
|
||||
"nctoc",
|
||||
)
|
||||
num = len(words)
|
||||
values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
|
||||
header = {}
|
||||
for n in range(num):
|
||||
header[words[n]] = values[n]
|
||||
|
||||
ordt1 = None
|
||||
ordt2 = None
|
||||
|
||||
ocnt, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
|
||||
if header["code"] == 0xFDEA or ocnt != 0 or oentries > 0:
|
||||
# horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
|
||||
# them in the proper place in the header. They seem to be codepage 65002 which seems
|
||||
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||
|
||||
# so we need to look for them and store them away to process leading text
|
||||
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||
# we only ever seem to use the second but ...
|
||||
assert ocnt == 1
|
||||
assert data[op1 : op1 + 4] == b"ORDT"
|
||||
assert data[op2 : op2 + 4] == b"ORDT"
|
||||
ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
|
||||
ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)
|
||||
|
||||
if self.DEBUG:
|
||||
logger.debug("parsed INDX header:")
|
||||
for n in words:
|
||||
logger.debug("%s %X" % (n, header[n]))
|
||||
logger.debug("")
|
||||
return header, ordt1, ordt2
|
||||
|
||||
def readCTOC(self, txtdata):
|
||||
# read all blocks from CTOC
|
||||
ctoc_data = {}
|
||||
offset = 0
|
||||
while offset < len(txtdata):
|
||||
if PY2:
|
||||
if txtdata[offset] == b"\0":
|
||||
break
|
||||
else:
|
||||
if txtdata[offset] == 0:
|
||||
break
|
||||
idx_offs = offset
|
||||
# first n bytes: name len as vwi
|
||||
pos, ilen = getVariableWidthValue(txtdata, offset)
|
||||
offset += pos
|
||||
# <len> next bytes: name
|
||||
name = txtdata[offset : offset + ilen]
|
||||
offset += ilen
|
||||
if self.DEBUG:
|
||||
logger.debug("name length is %s" % ilen)
|
||||
logger.debug("%s %s", (idx_offs, name))
|
||||
ctoc_data[idx_offs] = name
|
||||
return ctoc_data
|
||||
|
||||
|
||||
def getVariableWidthValue(data, offset):
|
||||
"""
|
||||
Decode variable width value from given bytes.
|
||||
|
||||
@param data: The bytes to decode.
|
||||
@param offset: The start offset into data.
|
||||
@return: Tuple of consumed bytes count and decoded value.
|
||||
"""
|
||||
value = 0
|
||||
consumed = 0
|
||||
finished = False
|
||||
while not finished:
|
||||
v = data[offset + consumed : offset + consumed + 1]
|
||||
consumed += 1
|
||||
if ord(v) & 0x80:
|
||||
finished = True
|
||||
value = (value << 7) | (ord(v) & 0x7F)
|
||||
return consumed, value
|
||||
|
||||
|
||||
def readTagSection(start, data):
|
||||
"""
|
||||
Read tag section from given data.
|
||||
|
||||
@param start: The start position in the data.
|
||||
@param data: The data to process.
|
||||
@return: Tuple of control byte count and list of tag tuples.
|
||||
"""
|
||||
controlByteCount = 0
|
||||
tags = []
|
||||
if data[start : start + 4] == b"TAGX":
|
||||
(firstEntryOffset,) = struct.unpack_from(b">L", data, start + 0x04)
|
||||
(controlByteCount,) = struct.unpack_from(b">L", data, start + 0x08)
|
||||
|
||||
# Skip the first 12 bytes already read above.
|
||||
for i in range(12, firstEntryOffset, 4):
|
||||
pos = start + i
|
||||
tags.append(
|
||||
(
|
||||
ord(data[pos : pos + 1]),
|
||||
ord(data[pos + 1 : pos + 2]),
|
||||
ord(data[pos + 2 : pos + 3]),
|
||||
ord(data[pos + 3 : pos + 4]),
|
||||
)
|
||||
)
|
||||
return controlByteCount, tags
|
||||
|
||||
|
||||
def countSetBits(value, bits=8):
|
||||
"""
|
||||
Count the set bits in the given value.
|
||||
|
||||
@param value: Integer value.
|
||||
@param bits: The number of bits of the input value (defaults to 8).
|
||||
@return: Number of set bits.
|
||||
"""
|
||||
count = 0
|
||||
for _ in range(bits):
|
||||
if value & 0x01 == 0x01:
|
||||
count += 1
|
||||
value = value >> 1
|
||||
return count
|
||||
|
||||
|
||||
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
|
||||
"""
|
||||
Create a map of tags and values from the given byte section.
|
||||
|
||||
@param controlByteCount: The number of control bytes.
|
||||
@param tagTable: The tag table.
|
||||
@param entryData: The data to process.
|
||||
@param startPos: The starting position in entryData.
|
||||
@param endPos: The end position in entryData or None if it is unknown.
|
||||
@return: Hashmap of tag and list of values.
|
||||
"""
|
||||
tags = []
|
||||
tagHashMap = {}
|
||||
controlByteIndex = 0
|
||||
dataStart = startPos + controlByteCount
|
||||
|
||||
for tag, valuesPerEntry, mask, endFlag in tagTable:
|
||||
if endFlag == 0x01:
|
||||
controlByteIndex += 1
|
||||
continue
|
||||
cbyte = ord(
|
||||
entryData[startPos + controlByteIndex : startPos + controlByteIndex + 1]
|
||||
)
|
||||
if 0:
|
||||
logger.debug(
|
||||
"Control Byte Index %0x , Control Byte Value %0x"
|
||||
% (controlByteIndex, cbyte)
|
||||
)
|
||||
|
||||
value = (
|
||||
ord(
|
||||
entryData[startPos + controlByteIndex : startPos + controlByteIndex + 1]
|
||||
)
|
||||
& mask
|
||||
)
|
||||
if value != 0:
|
||||
if value == mask:
|
||||
if countSetBits(mask) > 1:
|
||||
# If all bits of masked value are set and the mask has more than one bit, a variable width value
|
||||
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
|
||||
# which will contain the corresponding variable width values.
|
||||
consumed, value = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
tags.append((tag, None, value, valuesPerEntry))
|
||||
else:
|
||||
tags.append((tag, 1, None, valuesPerEntry))
|
||||
else:
|
||||
# Shift bits to get the masked value.
|
||||
while mask & 0x01 == 0:
|
||||
mask = mask >> 1
|
||||
value = value >> 1
|
||||
tags.append((tag, value, None, valuesPerEntry))
|
||||
for tag, valueCount, valueBytes, valuesPerEntry in tags:
|
||||
values = []
|
||||
if valueCount is not None:
|
||||
# Read valueCount * valuesPerEntry variable width values.
|
||||
for _ in range(valueCount):
|
||||
for _ in range(valuesPerEntry):
|
||||
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
values.append(data)
|
||||
else:
|
||||
# Convert valueBytes to variable width values.
|
||||
totalConsumed = 0
|
||||
while totalConsumed < valueBytes:
|
||||
# Does this work for valuesPerEntry != 1?
|
||||
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
totalConsumed += consumed
|
||||
values.append(data)
|
||||
if totalConsumed != valueBytes:
|
||||
logger.debug(
|
||||
"Error: Should consume %s bytes, but consumed %s"
|
||||
% (valueBytes, totalConsumed)
|
||||
)
|
||||
tagHashMap[tag] = values
|
||||
# Test that all bytes have been processed if endPos is given.
|
||||
if endPos is not None and dataStart != endPos:
|
||||
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
|
||||
for char in entryData[dataStart:endPos]:
|
||||
if bord(char) != 0:
|
||||
logger.debug(
|
||||
"Warning: There are unprocessed index bytes left: %s"
|
||||
% toHex(entryData[dataStart:endPos])
|
||||
)
|
||||
if 0:
|
||||
logger.debug("controlByteCount: %s" % controlByteCount)
|
||||
logger.debug("tagTable: %s" % tagTable)
|
||||
logger.debug("data: %s" % toHex(entryData[startPos:endPos]))
|
||||
logger.debug("tagHashMap: %s" % tagHashMap)
|
||||
break
|
||||
|
||||
return tagHashMap
|
||||
575
mobiparse/mobi/mobi_k8proc.py
Executable file
@@ -0,0 +1,575 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bstr, utf8_str
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import os
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
from .mobi_index import MobiIndex
|
||||
from .mobi_utils import fromBase32
|
||||
from .unipath import pathof
|
||||
|
||||
_guide_types = [
|
||||
b"cover",
|
||||
b"title-page",
|
||||
b"toc",
|
||||
b"index",
|
||||
b"glossary",
|
||||
b"acknowledgements",
|
||||
b"bibliography",
|
||||
b"colophon",
|
||||
b"copyright-page",
|
||||
b"dedication",
|
||||
b"epigraph",
|
||||
b"foreward",
|
||||
b"loi",
|
||||
b"lot",
|
||||
b"notes",
|
||||
b"preface",
|
||||
b"text",
|
||||
]
|
||||
|
||||
# locate beginning and ending positions of tag with specific aid attribute
|
||||
def locate_beg_end_of_tag(ml, aid):
|
||||
pattern = utf8_str(r"""<[^>]*\said\s*=\s*['"]%s['"][^>]*>""" % aid)
|
||||
aid_pattern = re.compile(pattern, re.IGNORECASE)
|
||||
for m in re.finditer(aid_pattern, ml):
|
||||
plt = m.start()
|
||||
pgt = ml.find(b">", plt + 1)
|
||||
return plt, pgt
|
||||
return 0, 0
|
||||
|
||||
|
||||
# iterate over all tags in block in reverse order, i.e. last tag to first tag
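# e.g. list(reverse_tag_iter(b"<p><b>x</b></p>")) == [b"</p>", b"</b>", b"<b>", b"<p>"]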
|
||||
def reverse_tag_iter(block):
|
||||
end = len(block)
|
||||
while True:
|
||||
pgt = block.rfind(b">", 0, end)
|
||||
if pgt == -1:
|
||||
break
|
||||
plt = block.rfind(b"<", 0, pgt)
|
||||
if plt == -1:
|
||||
break
|
||||
yield block[plt : pgt + 1]
|
||||
end = plt
|
||||
|
||||
|
||||
class K8Processor:
|
||||
def __init__(self, mh, sect, files, debug=False):
|
||||
self.sect = sect
|
||||
self.files = files
|
||||
self.mi = MobiIndex(sect)
|
||||
self.mh = mh
|
||||
self.skelidx = mh.skelidx
|
||||
self.fragidx = mh.fragidx
|
||||
self.guideidx = mh.guideidx
|
||||
self.fdst = mh.fdst
|
||||
self.flowmap = {}
|
||||
self.flows = None
|
||||
self.flowinfo = []
|
||||
self.parts = None
|
||||
self.partinfo = []
|
||||
self.linked_aids = set()
|
||||
self.fdsttbl = [0, 0xFFFFFFFF]
|
||||
self.DEBUG = debug
|
||||
|
||||
# read in and parse the FDST info which is very similar in format to the Palm DB section
|
||||
# parsing except it provides offsets into rawML file and not the Palm DB file
|
||||
# this is needed to split up the final css, svg, etc flow section
|
||||
# that can exist at the end of the rawML file
|
||||
if self.fdst != 0xFFFFFFFF:
|
||||
header = self.sect.loadSection(self.fdst)
|
||||
if header[0:4] == b"FDST":
|
||||
(num_sections,) = struct.unpack_from(b">L", header, 0x08)
|
||||
self.fdsttbl = struct.unpack_from(
|
||||
bstr(">%dL" % (num_sections * 2)), header, 12
|
||||
)[::2] + (mh.rawSize,)
|
||||
sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFDST Section Map: %d sections" % num_sections)
|
||||
for j in range(num_sections):
|
||||
logger.debug(
|
||||
"Section %d: 0x%08X - 0x%08X"
|
||||
% (j, self.fdsttbl[j], self.fdsttbl[j + 1])
|
||||
)
|
||||
else:
|
||||
logger.debug("\nError: K8 Mobi with Missing FDST info")
|
||||
|
||||
# read/process skeleton index info to create the skeleton table
|
||||
skeltbl = []
|
||||
if self.skelidx != 0xFFFFFFFF:
|
||||
# for i in range(2):
|
||||
# fname = 'skel%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.skelidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
|
||||
fileptr = 0
|
||||
for [text, tagMap] in outtbl:
|
||||
# file number, skeleton name, fragtbl record count, start position, length
|
||||
skeltbl.append(
|
||||
[fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]
|
||||
)
|
||||
fileptr += 1
|
||||
self.skeltbl = skeltbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nSkel Table: %d entries" % len(self.skeltbl))
|
||||
logger.debug(
|
||||
"table: filenum, skeleton name, frag tbl record count, start position, length"
|
||||
)
|
||||
for j in range(len(self.skeltbl)):
|
||||
logger.debug(self.skeltbl[j])
|
||||
|
||||
# read/process the fragment index to create the fragment table
|
||||
fragtbl = []
|
||||
if self.fragidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'frag%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.fragidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
|
||||
for [text, tagMap] in outtbl:
|
||||
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
|
||||
ctocoffset = tagMap[2][0]
|
||||
ctocdata = ctoc_text[ctocoffset]
|
||||
fragtbl.append(
|
||||
[
|
||||
int(text),
|
||||
ctocdata,
|
||||
tagMap[3][0],
|
||||
tagMap[4][0],
|
||||
tagMap[6][0],
|
||||
tagMap[6][1],
|
||||
]
|
||||
)
|
||||
self.fragtbl = fragtbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFragment Table: %d entries" % len(self.fragtbl))
|
||||
logger.debug(
|
||||
"table: file position, link id text, file num, sequence number, start position, length"
|
||||
)
|
||||
for j in range(len(self.fragtbl)):
|
||||
logger.debug(self.fragtbl[j])
|
||||
|
||||
# read / process guide index for guide elements of opf
|
||||
guidetbl = []
|
||||
if self.guideidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'guide%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.guideidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(
|
||||
self.guideidx, "KF8 Guide elements)"
|
||||
)
|
||||
for [text, tagMap] in outtbl:
|
||||
# ref_type, ref_title, frag number
|
||||
ctocoffset = tagMap[1][0]
|
||||
ref_title = ctoc_text[ctocoffset]
|
||||
ref_type = text
|
||||
fileno = None
|
||||
if 3 in tagMap:
|
||||
fileno = tagMap[3][0]
|
||||
if 6 in tagMap:
|
||||
fileno = tagMap[6][0]
|
||||
guidetbl.append([ref_type, ref_title, fileno])
|
||||
self.guidetbl = guidetbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nGuide Table: %d entries" % len(self.guidetbl))
|
||||
logger.debug("table: ref_type, ref_title, fragtbl entry number")
|
||||
for j in range(len(self.guidetbl)):
|
||||
logger.debug(self.guidetbl[j])
|
||||
|
||||
def buildParts(self, rawML):
|
||||
# now split the rawML into its flow pieces
|
||||
self.flows = []
|
||||
for j in range(0, len(self.fdsttbl) - 1):
|
||||
start = self.fdsttbl[j]
|
||||
end = self.fdsttbl[j + 1]
|
||||
self.flows.append(rawML[start:end])
|
||||
|
||||
# the first piece represents the xhtml text
|
||||
text = self.flows[0]
|
||||
self.flows[0] = b""
|
||||
|
||||
# walk the <skeleton> and fragment tables to build original source xhtml files
|
||||
# *without* destroying any file position information needed for later href processing
|
||||
# and create the final list of file separation start/stop points and related info in partinfo
|
||||
if self.DEBUG:
|
||||
logger.debug("\nRebuilding flow piece 0: the main body of the ebook")
|
||||
self.parts = []
|
||||
self.partinfo = []
|
||||
fragptr = 0
|
||||
baseptr = 0
|
||||
cnt = 0
|
||||
filename = "part%04d.xhtml" % cnt
|
||||
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
|
||||
baseptr = skelpos + skellen
|
||||
skeleton = text[skelpos:baseptr]
|
||||
aidtext = "0"
|
||||
for i in range(fragcnt):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[
|
||||
fragptr
|
||||
]
|
||||
aidtext = idtext[12:-2]
|
||||
if i == 0:
|
||||
filename = "part%04d.xhtml" % filenum
|
||||
slice = text[baseptr : baseptr + length]
|
||||
insertpos = insertpos - skelpos
|
||||
head = skeleton[:insertpos]
|
||||
tail = skeleton[insertpos:]
|
||||
actual_inspos = insertpos
|
||||
if tail.find(b">") < tail.find(b"<") or head.rfind(b">") < head.rfind(
|
||||
b"<"
|
||||
):
|
||||
# There is an incomplete tag in either the head or tail.
|
||||
# This can happen for some badly formed KF8 files
|
||||
logger.debug(
|
||||
"The fragment table for %s has incorrect insert position. Calculating manually."
|
||||
% skelname
|
||||
)
|
||||
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
|
||||
if bp != ep:
|
||||
actual_inspos = ep + 1 + startpos
|
||||
if insertpos != actual_inspos:
|
||||
print(
|
||||
"fixed corrupt fragment table insert position",
|
||||
insertpos + skelpos,
|
||||
actual_inspos + skelpos,
|
||||
)
|
||||
insertpos = actual_inspos
|
||||
self.fragtbl[fragptr][0] = actual_inspos + skelpos
|
||||
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
|
||||
baseptr = baseptr + length
|
||||
fragptr += 1
|
||||
cnt += 1
|
||||
self.parts.append(skeleton)
|
||||
self.partinfo.append([skelnum, "Text", filename, skelpos, baseptr, aidtext])
|
||||
|
||||
assembled_text = b"".join(self.parts)
|
||||
if self.DEBUG:
|
||||
outassembled = os.path.join(self.files.k8dir, "assembled_text.dat")
|
||||
with open(pathof(outassembled), "wb") as f:
|
||||
f.write(assembled_text)
|
||||
|
||||
# The primary css style sheet is typically stored next followed by any
|
||||
# snippets of code that were previously inlined in the
|
||||
# original xhtml but have been stripped out and placed here.
|
||||
# This can include local CDATA snippets and svg sections.
|
||||
|
||||
# The problem is that for most browsers and ereaders, you can not
|
||||
# use <img src="imageXXXX.svg" /> to import any svg image that itself
|
||||
# properly uses an <image/> tag to import some raster image - it
|
||||
# should work according to the spec but does not for almost all browsers
|
||||
# and ereaders and causes epub validation issues because those raster
|
||||
# images are in the manifest but not in the xhtml text - since they are only
|
||||
# referenced from an svg image
|
||||
|
||||
# So we need to check the remaining flow pieces to see if they are css
|
||||
# or svg images. if svg images, we must check if they have an <image />
|
||||
# and if so inline them into the xhtml text pieces.
|
||||
|
||||
# there may be other sorts of pieces stored here but until we see one
|
||||
# in the wild to reverse engineer we won't be able to tell
|
||||
self.flowinfo.append([None, None, None, None])
|
||||
svg_tag_pattern = re.compile(br"""(<svg[^>]*>)""", re.IGNORECASE)
|
||||
image_tag_pattern = re.compile(br"""(<image[^>]*>)""", re.IGNORECASE)
|
||||
for j in range(1, len(self.flows)):
|
||||
flowpart = self.flows[j]
|
||||
nstr = "%04d" % j
|
||||
m = re.search(svg_tag_pattern, flowpart)
|
||||
if m is not None:
|
||||
# svg
|
||||
ptype = b"svg"
|
||||
start = m.start()
|
||||
m2 = re.search(image_tag_pattern, flowpart)
|
||||
if m2 is not None:
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
# strip off anything before <svg if inlining
|
||||
flowpart = flowpart[start:]
|
||||
else:
|
||||
pformat = b"file"
|
||||
pdir = "Images"
|
||||
fname = "svgimg" + nstr + ".svg"
|
||||
else:
|
||||
# search for CDATA and if exists inline it
|
||||
if flowpart.find(b"[CDATA[") >= 0:
|
||||
ptype = b"css"
|
||||
flowpart = b'<style type="text/css">\n' + flowpart + b"\n</style>\n"
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
else:
|
||||
# css - assume as standalone css file
|
||||
ptype = b"css"
|
||||
pformat = b"file"
|
||||
pdir = "Styles"
|
||||
fname = "style" + nstr + ".css"
|
||||
|
||||
self.flows[j] = flowpart
|
||||
self.flowinfo.append([ptype, pformat, pdir, fname])
|
||||
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFlow Map: %d entries" % len(self.flowinfo))
|
||||
for fi in self.flowinfo:
|
||||
logger.debug(fi)
|
||||
logger.debug("\n")
|
||||
|
||||
logger.debug(
|
||||
"\nXHTML File Part Position Information: %d entries"
|
||||
% len(self.partinfo)
|
||||
)
|
||||
for pi in self.partinfo:
|
||||
logger.debug(pi)
|
||||
|
||||
if False:  # self.DEBUG:
|
||||
# dump all of the locations of the aid tags used in TEXT
|
||||
# find id links only inside of tags
|
||||
# inside any < > pair find all "aid=' and return whatever is inside the quotes
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
logger.debug("\npositions of all aid= pieces")
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>""", re.IGNORECASE
|
||||
)
|
||||
for m in re.finditer(id_pattern, rawML):
|
||||
[filename, partnum, start, end] = self.getFileInfo(m.start())
|
||||
[seqnum, idtext] = self.getFragTblInfo(m.start())
|
||||
value = fromBase32(m.group(1))
|
||||
logger.debug(
|
||||
" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d"
|
||||
% (m.group(1), value, m.start(), partnum, start, end)
|
||||
)
|
||||
logger.debug(" %s fragtbl entry %d" % (idtext, seqnum))
|
||||
|
||||
return
|
||||
|
||||
# get information fragment table entry by pos
|
||||
def getFragTblInfo(self, pos):
|
||||
for j in range(len(self.fragtbl)):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
|
||||
if pos >= insertpos and pos < (insertpos + length):
|
||||
# why are these "in: and before: added here
|
||||
return seqnum, b"in: " + idtext
|
||||
if pos < insertpos:
|
||||
return seqnum, b"before: " + idtext
|
||||
return None, None
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getFileInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return filename, partnum, start, end
|
||||
return None, None, None, None
|
||||
|
||||
# accessor functions to properly protect the internal structure
|
||||
def getNumberOfParts(self):
|
||||
return len(self.parts)
|
||||
|
||||
def getPart(self, i):
|
||||
if i >= 0 and i < len(self.parts):
|
||||
return self.parts[i]
|
||||
return None
|
||||
|
||||
def getPartInfo(self, i):
|
||||
if i >= 0 and i < len(self.partinfo):
|
||||
return self.partinfo[i]
|
||||
return None
|
||||
|
||||
def getNumberOfFlows(self):
|
||||
return len(self.flows)
|
||||
|
||||
def getFlow(self, i):
|
||||
# note flows[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flows):
|
||||
return self.flows[i]
|
||||
return None
|
||||
|
||||
def getFlowInfo(self, i):
|
||||
# note flowinfo[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flowinfo):
|
||||
return self.flowinfo[i]
|
||||
return None
|
||||
|
||||
def getIDTagByPosFid(self, posfid, offset):
|
||||
# first convert kindle:pos:fid and offset info to position in file
|
||||
# (fromBase32 can handle both string types on input)
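# illustrative (hypothetical values): posfid=b"0003", offset=b"001A" give
# row 3 and off fromBase32("001A") == 1 * 32 + 10 == 42, so pos becomes
# fragtbl[3]'s insert position plus 42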
|
||||
row = fromBase32(posfid)
|
||||
off = fromBase32(offset)
|
||||
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
|
||||
pos = insertpos + off
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if fname is None:
|
||||
# pos does not exist
|
||||
# default to skeleton pos instead
|
||||
print(
|
||||
"Link To Position", pos, "does not exist, retargeting to top of target"
|
||||
)
|
||||
pos = self.skeltbl[filenum][3]
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
# an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
|
||||
# Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
|
||||
# some position information encoded into Base32 name.
|
||||
# so find the closest "id=" before position the file by actually searching in that file
|
||||
idtext = self.getIDTag(pos)
|
||||
return fname, idtext
|
||||
|
||||
def getIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos is inside a tag then search all text before its end of tag marker
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
aid_pattern = re.compile(br"""<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]""")
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
m = aid_pattern.match(tag)
|
||||
if m is not None:
|
||||
self.linked_aids.add(m.group(1))
|
||||
return b"aid-" + m.group(1)
|
||||
return b""
|
||||
|
||||
# do we need to do deep copying
|
||||
def setParts(self, parts):
|
||||
assert len(parts) == len(self.parts)
|
||||
for i in range(len(parts)):
|
||||
self.parts[i] = parts[i]
|
||||
|
||||
# do we need to do deep copying
|
||||
def setFlows(self, flows):
|
||||
assert len(flows) == len(self.flows)
|
||||
for i in range(len(flows)):
|
||||
self.flows[i] = flows[i]
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getSkelInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return [partnum, pdir, filename, start, end, aidtext]
|
||||
return [None, None, None, None, None, None]
|
||||
|
||||
# fileno is actually a reference into fragtbl (a fragment)
|
||||
def getGuideText(self):
|
||||
guidetext = b""
|
||||
for [ref_type, ref_title, fileno] in self.guidetbl:
|
||||
if ref_type == b"thumbimagestandard":
|
||||
continue
|
||||
if ref_type not in _guide_types and not ref_type.startswith(b"other."):
|
||||
if ref_type == b"start":
|
||||
ref_type = b"text"
|
||||
else:
|
||||
ref_type = b"other." + ref_type
|
||||
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
|
||||
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
|
||||
idtext = self.getIDTag(pos)
|
||||
linktgt = filename.encode("utf-8")
|
||||
if idtext != b"":
|
||||
linktgt += b"#" + idtext
|
||||
guidetext += (
|
||||
b'<reference type="'
|
||||
+ ref_type
|
||||
+ b'" title="'
|
||||
+ ref_title
|
||||
+ b'" href="'
|
||||
+ utf8_str(pdir)
|
||||
+ b"/"
|
||||
+ linktgt
|
||||
+ b'" />\n'
|
||||
)
|
||||
# opf is encoded utf-8 so must convert any titles properly
|
||||
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
|
||||
return guidetext
|
||||
|
||||
def getPageIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
# but page map offsets need a little more leeway so if the offset points
|
||||
# into a tag look for the next ending tag "/>" or "</" and start your search from there.
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos is inside a tag then search all text before the next ending tag
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
# we are in a tag
|
||||
# so find first ending tag
|
||||
pend1 = textblock.find(b"/>", npos)
|
||||
pend2 = textblock.find(b"</", npos)
|
||||
if pend1 != -1 and pend2 != -1:
|
||||
pend = min(pend1, pend2)
|
||||
else:
|
||||
pend = max(pend1, pend2)
|
||||
if pend != -1:
|
||||
npos = pend
|
||||
else:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
return b""
|
||||
290
mobiparse/mobi/mobi_k8resc.py
Executable file
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7.
|
||||
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
|
||||
|
||||
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||
from collections import OrderedDict as dict_
|
||||
else:
|
||||
dict_ = dict
|
||||
|
||||
from .compatibility_utils import unicode_str
|
||||
from loguru import logger
|
||||
|
||||
from .mobi_utils import fromBase32
|
||||
|
||||
_OPF_PARENT_TAGS = [
|
||||
"xml",
|
||||
"package",
|
||||
"metadata",
|
||||
"dc-metadata",
|
||||
"x-metadata",
|
||||
"manifest",
|
||||
"spine",
|
||||
"tours",
|
||||
"guide",
|
||||
]
|
||||
|
||||
|
||||
class K8RESCProcessor(object):
|
||||
def __init__(self, data, debug=False):
|
||||
self._debug = debug
|
||||
self.resc = None
|
||||
self.opos = 0
|
||||
self.extrameta = []
|
||||
self.cover_name = None
|
||||
self.spine_idrefs = {}
|
||||
self.spine_order = []
|
||||
self.spine_pageattributes = {}
|
||||
self.spine_ppd = None
|
||||
# need3 indicate the book has fields which require epub3.
|
||||
# but the estimation of the source epub version from the fields is difficult.
|
||||
self.need3 = False
|
||||
self.package_ver = None
|
||||
self.extra_metadata = []
|
||||
self.refines_metadata = []
|
||||
self.extra_attributes = []
|
||||
# get header
|
||||
start_pos = data.find(b"<")
|
||||
self.resc_header = data[:start_pos]
|
||||
# get resc data length
|
||||
start = self.resc_header.find(b"=") + 1
|
||||
end = self.resc_header.find(b"&", start)
|
||||
resc_size = 0
|
||||
if end > 0:
|
||||
resc_size = fromBase32(self.resc_header[start:end])
|
||||
resc_rawbytes = len(data) - start_pos
|
||||
if resc_rawbytes == resc_size:
|
||||
self.resc_length = resc_size
|
||||
else:
|
||||
# Most RESC sections have a nul string at the tail but some do not.
|
||||
end_pos = data.find(b"\x00", start_pos)
|
||||
if end_pos < 0:
|
||||
self.resc_length = resc_rawbytes
|
||||
else:
|
||||
self.resc_length = end_pos - start_pos
|
||||
if self.resc_length != resc_size:
|
||||
logger.debug(
|
||||
"Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(
|
||||
self.resc_length, resc_size
|
||||
)
|
||||
)
|
||||
# now parse RESC after converting it to unicode from utf-8
|
||||
self.resc = unicode_str(data[start_pos : start_pos + self.resc_length])
|
||||
self.parseData()
|
||||
|
||||
def prepend_to_spine(self, key, idref, linear, properties):
|
||||
self.spine_order = [key] + self.spine_order
|
||||
self.spine_idrefs[key] = idref
|
||||
attributes = {}
|
||||
if linear is not None:
|
||||
attributes["linear"] = linear
|
||||
if properties is not None:
|
||||
attributes["properties"] = properties
|
||||
self.spine_pageattributes[key] = attributes
|
||||
|
||||
# RESC tag iterator
|
||||
def resc_tag_iter(self):
|
||||
tcontent = last_tattr = None
|
||||
prefix = [""]
|
||||
while True:
|
||||
text, tag = self.parseresc()
|
||||
if text is None and tag is None:
|
||||
break
|
||||
if text is not None:
|
||||
tcontent = text.rstrip(" \r\n")
|
||||
else: # we have a tag
|
||||
ttype, tname, tattr = self.parsetag(tag)
|
||||
if ttype == "begin":
|
||||
tcontent = None
|
||||
prefix.append(tname + ".")
|
||||
if tname in _OPF_PARENT_TAGS:
|
||||
yield "".join(prefix), tname, tattr, tcontent
|
||||
else:
|
||||
last_tattr = tattr
|
||||
else: # single or end
|
||||
if ttype == "end":
|
||||
prefix.pop()
|
||||
tattr = last_tattr
|
||||
last_tattr = None
|
||||
if tname in _OPF_PARENT_TAGS:
|
||||
tname += "-end"
|
||||
yield "".join(prefix), tname, tattr, tcontent
|
||||
tcontent = None
|
||||
|
||||
# now parse the RESC to extract spine and extra metadata info
|
||||
def parseData(self):
|
||||
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
|
||||
if self._debug:
|
||||
logger.debug(
|
||||
" Parsing RESC: %s %s %s %s" % (prefix, tname, tattr, tcontent)
|
||||
)
|
||||
if tname == "package":
|
||||
self.package_ver = tattr.get("version", "2.0")
|
||||
package_prefix = tattr.get("prefix", "")
|
||||
if self.package_ver.startswith("3") or package_prefix.startswith(
|
||||
"rendition"
|
||||
):
|
||||
self.need3 = True
|
||||
if tname == "spine":
|
||||
self.spine_ppd = tattr.get("page-progression-direction", None)
|
||||
if self.spine_ppd is not None and self.spine_ppd == "rtl":
|
||||
self.need3 = True
|
||||
if tname == "itemref":
|
||||
skelid = tattr.pop("skelid", None)
|
||||
if skelid is None and len(self.spine_order) == 0:
|
||||
# assume it was the removed initial coverpage
|
||||
skelid = "coverpage"
|
||||
tattr["linear"] = "no"
|
||||
self.spine_order.append(skelid)
|
||||
idref = tattr.pop("idref", None)
|
||||
if idref is not None:
|
||||
idref = "x_" + idref
|
||||
self.spine_idrefs[skelid] = idref
|
||||
if "id" in tattr:
|
||||
del tattr["id"]
|
||||
# tattr["id"] = 'x_' + tattr["id"]
|
||||
if "properties" in tattr:
|
||||
self.need3 = True
|
||||
self.spine_pageattributes[skelid] = tattr
|
||||
if tname == "meta" or tname.startswith("dc:"):
|
||||
if "refines" in tattr or "property" in tattr:
|
||||
self.need3 = True
|
||||
if tattr.get("name", "") == "cover":
|
||||
cover_name = tattr.get("content", None)
|
||||
if cover_name is not None:
|
||||
cover_name = "x_" + cover_name
|
||||
self.cover_name = cover_name
|
||||
else:
|
||||
self.extrameta.append([tname, tattr, tcontent])
|
||||
|
||||
# parse and return either leading text or the next tag
|
||||
def parseresc(self):
|
||||
p = self.opos
|
||||
if p >= len(self.resc):
|
||||
return None, None
|
||||
if self.resc[p] != "<":
|
||||
res = self.resc.find("<", p)
|
||||
if res == -1:
|
||||
res = len(self.resc)
|
||||
self.opos = res
|
||||
return self.resc[p:res], None
|
||||
# handle comment as a special case
|
||||
if self.resc[p : p + 4] == "<!--":
|
||||
te = self.resc.find("-->", p + 1)
|
||||
if te != -1:
|
||||
te = te + 2
|
||||
else:
|
||||
te = self.resc.find(">", p + 1)
|
||||
ntb = self.resc.find("<", p + 1)
|
||||
if ntb != -1 and ntb < te:
|
||||
self.opos = ntb
|
||||
return self.resc[p:ntb], None
|
||||
self.opos = te + 1
|
||||
return None, self.resc[p : te + 1]
|
||||
|
||||
# parses tag to identify: [ttype, tname, tattr]
|
||||
# tname: tag name
|
||||
# ttype: tag type ('begin', 'end' or 'single');
|
||||
# tattr: dictionary of tag attributes
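# illustrative (hypothetical tag): parsetag('<itemref idref="item1" linear="no"/>')
# returns ("single", "itemref", {"idref": "item1", "linear": "no"})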
|
||||
def parsetag(self, s):
|
||||
p = 1
|
||||
tname = None
|
||||
ttype = None
|
||||
tattr = dict_()
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] == "/":
|
||||
ttype = "end"
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
|
||||
p += 1
|
||||
tname = s[b:p].lower()
|
||||
# some special cases
|
||||
if tname == "?xml":
|
||||
tname = "xml"
|
||||
if tname == "!--":
|
||||
ttype = "single"
|
||||
comment = s[p:-3].strip()
|
||||
tattr["comment"] = comment
|
||||
if ttype is None:
|
||||
# parse any attributes of begin or single tags
|
||||
while s.find("=", p) != -1:
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] != "=":
|
||||
p += 1
|
||||
aname = s[b:p].lower()
|
||||
aname = aname.rstrip(" ")
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] in ('"', "'"):
|
||||
p = p + 1
|
||||
b = p
|
||||
while s[p : p + 1] not in ('"', "'"):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
p += 1
|
||||
else:
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " "):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
tattr[aname] = val
|
||||
if ttype is None:
|
||||
ttype = "begin"
|
||||
if s.find("/", p) >= 0:
|
||||
ttype = "single"
|
||||
return ttype, tname, tattr
|
||||
|
||||
def taginfo_toxml(self, taginfo):
|
||||
res = []
|
||||
tname, tattr, tcontent = taginfo
|
||||
res.append("<" + tname)
|
||||
if tattr is not None:
|
||||
for key in tattr:
|
||||
res.append(" " + key + '="' + tattr[key] + '"')
|
||||
if tcontent is not None:
|
||||
res.append(">" + tcontent + "</" + tname + ">\n")
|
||||
else:
|
||||
res.append("/>\n")
|
||||
return "".join(res)
|
||||
|
||||
def hasSpine(self):
|
||||
return len(self.spine_order) > 0
|
||||
|
||||
def needEPUB3(self):
|
||||
return self.need3
|
||||
|
||||
def hasRefines(self):
|
||||
for [tname, tattr, tcontent] in self.extrameta:
|
||||
if "refines" in tattr:
|
||||
return True
|
||||
return False
|
||||
|
||||
def createMetadata(self, epubver):
|
||||
for taginfo in self.extrameta:
|
||||
tname, tattr, tcontent = taginfo
|
||||
if "refines" in tattr:
|
||||
if epubver == "F" and "property" in tattr:
|
||||
attr = ' id="%s" opf:%s="%s"\n' % (
|
||||
tattr["refines"],
|
||||
tattr["property"],
|
||||
tcontent,
|
||||
)
|
||||
self.extra_attributes.append(attr)
|
||||
else:
|
||||
tag = self.taginfo_toxml(taginfo)
|
||||
self.refines_metadata.append(tag)
|
||||
else:
|
||||
tag = self.taginfo_toxml(taginfo)
|
||||
self.extra_metadata.append(tag)
|
||||
202
mobiparse/mobi/mobi_nav.py
Executable file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import unicode_str
|
||||
import os
|
||||
from .unipath import pathof
|
||||
from loguru import logger
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
DEBUG_NAV = False
|
||||
|
||||
FORCE_DEFAULT_TITLE = False
|
||||
""" Set to True to force to use the default title. """
|
||||
|
||||
NAVIGATION_FINENAME = "nav.xhtml"
|
||||
""" The name for the navigation document. """
|
||||
|
||||
DEFAULT_TITLE = "Navigation"
|
||||
""" The default title for the navigation document. """
|
||||
|
||||
|
||||
class NAVProcessor(object):
|
||||
def __init__(self, files):
|
||||
self.files = files
|
||||
self.navname = NAVIGATION_FINENAME
|
||||
|
||||
def buildLandmarks(self, guidetext):
|
||||
header = ""
|
||||
header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
|
||||
header += " <h2>Guide</h2>\n"
|
||||
header += " <ol>\n"
|
||||
element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
|
||||
footer = ""
|
||||
footer += " </ol>\n"
|
||||
footer += " </nav>\n"
|
||||
|
||||
type_map = {
|
||||
"cover": "cover",
|
||||
"title-page": "title-page",
|
||||
# ?: 'frontmatter',
|
||||
"text": "bodymatter",
|
||||
# ?: 'backmatter',
|
||||
"toc": "toc",
|
||||
"loi": "loi",
|
||||
"lot": "lot",
|
||||
"preface": "preface",
|
||||
"bibliography": "bibliography",
|
||||
"index": "index",
|
||||
"glossary": "glossary",
|
||||
"acknowledgements": "acknowledgements",
|
||||
"colophon": None,
|
||||
"copyright-page": None,
|
||||
"dedication": None,
|
||||
"epigraph": None,
|
||||
"foreword": None,
|
||||
"notes": None,
|
||||
}
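# guide types mapped to None above have no landmarks equivalent and are skipped when the <li> entries are built below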
|
||||
|
||||
re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
|
||||
re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
|
||||
re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
|
||||
dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace("\\", "/")
|
||||
|
||||
data = ""
|
||||
references = re.findall(r"<reference\s+.*?>", unicode_str(guidetext), re.I)
|
||||
for reference in references:
|
||||
mo_type = re_type.search(reference)
|
||||
mo_title = re_title.search(reference)
|
||||
mo_link = re_link.search(reference)
|
||||
if mo_type is not None:
|
||||
type_ = type_map.get(mo_type.group(1), None)
|
||||
else:
|
||||
type_ = None
|
||||
if mo_title is not None:
|
||||
title = mo_title.group(1)
|
||||
else:
|
||||
title = None
|
||||
if mo_link is not None:
|
||||
link = mo_link.group(1)
|
||||
else:
|
||||
link = None
|
||||
|
||||
if type_ is not None and title is not None and link is not None:
|
||||
link = os.path.relpath(link, dir_).replace("\\", "/")
|
||||
data += element.format(type_, link, title)
|
||||
if len(data) > 0:
|
||||
return header + data + footer
|
||||
else:
|
||||
return ""
|
||||
|
||||
def buildTOC(self, indx_data):
|
||||
header = ""
|
||||
header += ' <nav epub:type="toc" id="toc">\n'
|
||||
header += " <h1>Table of contents</h1>\n"
|
||||
footer = " </nav>\n"
|
||||
|
||||
# recursive part
|
||||
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
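# walks indx_data[start:end], emitting an <li> for every entry at heading level lvl and recursing into child ranges; max_lvl tracks the deepest level seen and num counts the entries written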
|
||||
if start > len(indx_data) or end > len(indx_data):
|
||||
logger.debug(
|
||||
"Warning (in buildTOC): missing INDX child entries",
|
||||
start,
|
||||
end,
|
||||
len(indx_data),
|
||||
)
|
||||
return ""
|
||||
if DEBUG_NAV:
|
||||
logger.debug(
|
||||
"recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)
|
||||
)
|
||||
xhtml = ""
|
||||
if start <= 0:
|
||||
start = 0
|
||||
if end <= 0:
|
||||
end = len(indx_data)
|
||||
if lvl > max_lvl:
|
||||
max_lvl = lvl
|
||||
|
||||
indent1 = " " * (2 + lvl * 2)
|
||||
indent2 = " " * (3 + lvl * 2)
|
||||
xhtml += indent1 + "<ol>\n"
|
||||
for i in range(start, end):
|
||||
e = indx_data[i]
|
||||
htmlfile = e["filename"]
|
||||
desttag = e["idtag"]
|
||||
text = e["text"]
|
||||
if not e["hlvl"] == lvl:
|
||||
continue
|
||||
num += 1
|
||||
if desttag == "":
|
||||
link = htmlfile
|
||||
else:
|
||||
link = "{:s}#{:s}".format(htmlfile, desttag)
|
||||
xhtml += indent2 + "<li>"
|
||||
entry = '<a href="{:}">{:s}</a>'.format(link, text)
|
||||
xhtml += entry
|
||||
# recurs
|
||||
if e["child1"] >= 0:
|
||||
xhtml += "\n"
|
||||
xhtmlrec, max_lvl, num = recursINDX(
|
||||
max_lvl, num, lvl + 1, e["child1"], e["childn"] + 1
|
||||
)
|
||||
xhtml += xhtmlrec
|
||||
xhtml += indent2
|
||||
# close entry
|
||||
xhtml += "</li>\n"
|
||||
xhtml += indent1 + "</ol>\n"
|
||||
return xhtml, max_lvl, num
|
||||
|
||||
data, max_lvl, num = recursINDX()
|
||||
if not len(indx_data) == num:
|
||||
logger.debug(
|
||||
"Warning (in buildTOC): different number of entries in NCX",
|
||||
len(indx_data),
|
||||
num,
|
||||
)
|
||||
return header + data + footer
|
||||
|
||||
def buildNAV(self, ncx_data, guidetext, title, lang):
|
||||
logger.debug("Building Navigation Document.")
|
||||
if FORCE_DEFAULT_TITLE:
|
||||
title = DEFAULT_TITLE
|
||||
nav_header = ""
|
||||
nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
|
||||
nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||
nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||
nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
|
||||
nav_header += "<head>\n<title>{:s}</title>\n".format(title)
|
||||
nav_header += '<meta charset="UTF-8" />\n'
|
||||
nav_header += '<style type="text/css">\n'
|
||||
nav_header += "nav#landmarks { display:none; }\n"
|
||||
nav_header += "</style>\n</head>\n<body>\n"
|
||||
nav_footer = "</body>\n</html>\n"
|
||||
|
||||
landmarks = self.buildLandmarks(guidetext)
|
||||
toc = self.buildTOC(ncx_data)
|
||||
|
||||
data = nav_header
|
||||
data += landmarks
|
||||
data += toc
|
||||
data += nav_footer
|
||||
return data
|
||||
|
||||
def getNAVName(self):
|
||||
return self.navname
|
||||
|
||||
def writeNAV(self, ncx_data, guidetext, metadata):
|
||||
# build the xhtml
|
||||
# logger.debug("Write Navigation Document.")
|
||||
xhtml = self.buildNAV(
|
||||
ncx_data, guidetext, metadata.get("Title")[0], metadata.get("Language")[0]
|
||||
)
|
||||
fname = os.path.join(self.files.k8text, self.navname)
|
||||
with open(pathof(fname), "wb") as f:
|
||||
f.write(xhtml.encode("utf-8"))
|
||||
132
mobiparse/mobi/mobi_ncx.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
import os
|
||||
from .unipath import pathof
|
||||
from loguru import logger
|
||||
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
'''
|
||||
NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
|
||||
to Digital Talking Books, eBooks, and general web content models.
|
||||
This DTD is an XML application that layers navigation functionality on top of SMIL 2.0 content.
|
||||
The NCX defines a navigation path/model that may be applied upon existing publications,
|
||||
without modification of the existing publication source, so long as the navigation targets within
|
||||
the source publication can be directly referenced via a URI.
|
||||
|
||||
http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
|
||||
'''
|
||||
|
||||
from .mobi_utils import toBase32
|
||||
from .mobi_index import MobiIndex
|
||||
|
||||
DEBUG_NCX = False
|
||||
|
||||
class ncxExtract:
|
||||
def __init__(self, mh):
|
||||
self.mh = mh
|
||||
self.sect = self.mh.sect
|
||||
self.isNCX = False
|
||||
self.mi = MobiIndex(self.sect)
|
||||
self.ncxidx = self.mh.ncxidx
|
||||
self.indx_data = None
|
||||
|
||||
def parseNCX(self):
|
||||
indx_data = []
|
||||
tag_fieldname_map = {
|
||||
1: ["pos", 0],
|
||||
2: ["len", 0],
|
||||
3: ["noffs", 0],
|
||||
4: ["hlvl", 0],
|
||||
5: ["koffs", 0],
|
||||
6: ["pos_fid", 0],
|
||||
21: ["parent", 0],
|
||||
22: ["child1", 0],
|
||||
23: ["childn", 0],
|
||||
}
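# maps NCX INDX tag ids to (field name, index into the tag's value list); any tag present in tagMap below overwrites the default in tmp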
|
||||
if self.ncxidx != 0xFFFFFFFF:
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
|
||||
if DEBUG_NCX:
|
||||
logger.debug("ctoc_text {}".format(ctoc_text))
|
||||
logger.debug("outtbl {}".format(outtbl))
|
||||
num = 0
|
||||
for [text, tagMap] in outtbl:
|
||||
tmp = {
|
||||
"name": text.decode("utf-8"),
|
||||
"pos": -1,
|
||||
"len": 0,
|
||||
"noffs": -1,
|
||||
"text": "Unknown Text",
|
||||
"hlvl": -1,
|
||||
"kind": "Unknown Kind",
|
||||
"pos_fid": None,
|
||||
"parent": -1,
|
||||
"child1": -1,
|
||||
"childn": -1,
|
||||
"num": num,
|
||||
}
|
||||
for tag in tag_fieldname_map:
|
||||
[fieldname, i] = tag_fieldname_map[tag]
|
||||
if tag in tagMap:
|
||||
fieldvalue = tagMap[tag][i]
|
||||
if tag == 6:
|
||||
pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
|
||||
fieldvalue2 = tagMap[tag][i + 1]
|
||||
pos_off = toBase32(fieldvalue2, 10).decode("utf-8")
|
||||
fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
|
||||
tmp[fieldname] = fieldvalue
|
||||
if tag == 3:
|
||||
toctext = ctoc_text.get(fieldvalue, "Unknown Text")
|
||||
toctext = toctext.decode(self.mh.codec)
|
||||
tmp["text"] = toctext
|
||||
if tag == 5:
|
||||
kindtext = ctoc_text.get(fieldvalue, "Unknown Kind")
|
||||
kindtext = kindtext.decode(self.mh.codec)
|
||||
tmp["kind"] = kindtext
|
||||
indx_data.append(tmp)
|
||||
|
||||
# CGDBG
|
||||
'''
|
||||
record number: 3
|
||||
name: 03
|
||||
position 461377 length: 465358 => position/150 = real page number
|
||||
text: 第二章 青铜时代——单机游戏
|
||||
kind: Unknown Kind
|
||||
heading level: 0 => level of section
|
||||
parent: -1 => record number of previous level of section
|
||||
first child: 15 last child: 26 => range of record number of next level section
|
||||
pos_fid is kindle:pos:fid:0023:off:0000000000
|
||||
'''
|
||||
if DEBUG_NCX:
|
||||
print("record number: ", num)
|
||||
print(
|
||||
"name: ", tmp["name"],
|
||||
)
|
||||
print("position", tmp["pos"], " length: ", tmp["len"])
|
||||
print("text: ", tmp["text"])
|
||||
print("kind: ", tmp["kind"])
|
||||
print("heading level: ", tmp["hlvl"])
|
||||
print("parent:", tmp["parent"])
|
||||
print(
|
||||
"first child: ", tmp["child1"], " last child: ", tmp["childn"]
|
||||
)
|
||||
print("pos_fid is ", tmp["pos_fid"])
|
||||
print("\n\n")
|
||||
num += 1
|
||||
self.indx_data = indx_data
|
||||
|
||||
# {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0, 'text': '版权信息', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 0}
|
||||
# {'name': '0B', 'pos': 67932, 'len': 3274, 'noffs': 236, 'text': '8.希罗多德', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 11}
|
||||
#print('indx_data {}'.format(json.dumps(indx_data, indent=4, sort_keys=True, ensure_ascii=False)))
|
||||
|
||||
return indx_data
|
||||
|
||||
828
mobiparse/mobi/mobi_opf.py
Executable file
@@ -0,0 +1,828 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import unicode_str, unescapeit
|
||||
from .compatibility_utils import lzip
|
||||
from loguru import logger
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded
|
||||
# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
|
||||
# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems.
|
||||
# They might be changed to False in the future.
|
||||
|
||||
EPUB3_WITH_NCX = True # Do not set to False except for debug.
|
||||
""" Set to True to create a toc.ncx when converting to epub3. """
|
||||
|
||||
EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
|
||||
""" Set to True to create a guide element in an opf when converting to epub3. """
|
||||
|
||||
EPUB_OPF = "content.opf"
|
||||
""" The name for the OPF of EPUB. """
|
||||
|
||||
TOC_NCX = "toc.ncx"
|
||||
""" The name for the TOC of EPUB2. """
|
||||
|
||||
NAVIGATION_DOCUMENT = "nav.xhtml"
|
||||
""" The name for the navigation document of EPUB3. """
|
||||
|
||||
BEGIN_INFO_ONLY = "<!-- BEGIN INFORMATION ONLY "
|
||||
""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
|
||||
|
||||
END_INFO_ONLY = "END INFORMATION ONLY -->"
|
||||
""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
|
||||
|
||||
EXTH_TITLE_FURIGANA = "Title-Pronunciation"
|
||||
""" The name for Title Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTH_CREATOR_FURIGANA = "Author-Pronunciation"
|
||||
""" The name for Creator Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTH_PUBLISHER_FURIGANA = "Publisher-Pronunciation"
|
||||
""" The name for Publisher Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTRA_ENTITIES = {'"': """, "'": "'"}
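# extra entities handed to escapeit()/xmlescape() so quote characters are also escaped when a value is written into an XML attribute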
|
||||
|
||||
|
||||
class OPFProcessor(object):
|
||||
def __init__(
|
||||
self,
|
||||
files,
|
||||
metadata,
|
||||
fileinfo,
|
||||
rscnames,
|
||||
hasNCX,
|
||||
mh,
|
||||
usedmap,
|
||||
pagemapxml="",
|
||||
guidetext="",
|
||||
k8resc=None,
|
||||
epubver="2",
|
||||
):
|
||||
self.files = files
|
||||
self.metadata = metadata
|
||||
self.fileinfo = fileinfo
|
||||
self.rscnames = rscnames
|
||||
self.has_ncx = hasNCX
|
||||
self.codec = mh.codec
|
||||
self.isK8 = mh.isK8()
|
||||
self.printReplica = mh.isPrintReplica()
|
||||
self.guidetext = unicode_str(guidetext)
|
||||
self.used = usedmap
|
||||
self.k8resc = k8resc
|
||||
self.covername = None
|
||||
self.cover_id = "cover_img"
|
||||
if self.k8resc is not None and self.k8resc.cover_name is not None:
|
||||
# update cover id info from RESC if available
|
||||
self.cover_id = self.k8resc.cover_name
|
||||
# Create a unique urn uuid
|
||||
self.BookId = unicode_str(str(uuid.uuid4()))
|
||||
self.pagemap = pagemapxml
|
||||
|
||||
self.ncxname = None
|
||||
self.navname = None
|
||||
|
||||
# page-progression-direction is only set in spine
|
||||
self.page_progression_direction = metadata.pop(
|
||||
"page-progression-direction", [None]
|
||||
)[0]
|
||||
if "rl" in metadata.get("primary-writing-mode", [""])[0]:
|
||||
self.page_progression_direction = "rtl"
|
||||
self.epubver = epubver # the epub version set by user
|
||||
self.target_epubver = (
|
||||
epubver  # the epub version set by the user or detected automatically
|
||||
)
|
||||
if self.epubver == "A":
|
||||
self.target_epubver = self.autodetectEPUBVersion()
|
||||
elif self.epubver == "F":
|
||||
self.target_epubver = "2"
|
||||
elif self.epubver != "2" and self.epubver != "3":
|
||||
self.target_epubver = "2"
|
||||
|
||||
# id for rifine attributes
|
||||
self.title_id = {}
|
||||
self.creator_id = {}
|
||||
self.publisher_id = {}
|
||||
# extra attributes
|
||||
self.title_attrib = {}
|
||||
self.creator_attrib = {}
|
||||
self.publisher_attrib = {}
|
||||
self.extra_attributes = [] # for force epub2 option
|
||||
# Create epub3 metadata from EXTH.
|
||||
self.exth_solved_refines_metadata = []
|
||||
self.exth_refines_metadata = []
|
||||
self.exth_fixedlayout_metadata = []
|
||||
|
||||
self.defineRefinesID()
|
||||
self.processRefinesMetadata()
|
||||
if self.k8resc is not None:
|
||||
# Create metadata in RESC section.
|
||||
self.k8resc.createMetadata(epubver)
|
||||
if self.target_epubver == "3":
|
||||
self.createMetadataForFixedlayout()
|
||||
|
||||
def escapeit(self, sval, EXTRAS=None):
|
||||
# note, xmlescape and unescape do not work with utf-8 bytestrings
|
||||
sval = unicode_str(sval)
|
||||
if EXTRAS:
|
||||
res = xmlescape(unescapeit(sval), EXTRAS)
|
||||
else:
|
||||
res = xmlescape(unescapeit(sval))
|
||||
return res
|
||||
|
||||
def createMetaTag(self, data, property, content, refid=""):
|
||||
refines = ""
|
||||
if refid:
|
||||
refines = ' refines="#%s"' % refid
|
||||
data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
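# e.g. createMetaTag(data, "rendition:layout", "pre-paginated") appends '<meta property="rendition:layout">pre-paginated</meta>\n'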
|
||||
|
||||
def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
|
||||
# convert from EXTH metadata format to target epub version metadata
|
||||
# epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
|
||||
# but allows them to be present for backwards compatibility
|
||||
# instead the new format is
|
||||
# <meta property="xxxx" id="iiii" ... > property_value</meta>
|
||||
# and DCMES elements such as:
|
||||
# <dc:blah id="iiii">value</dc:blah>
|
||||
|
||||
metadata = self.metadata
|
||||
k8resc = self.k8resc
|
||||
|
||||
META_TAGS = [
|
||||
"Drm Server Id",
|
||||
"Drm Commerce Id",
|
||||
"Drm Ebookbase Book Id",
|
||||
"ASIN",
|
||||
"ThumbOffset",
|
||||
"Fake Cover",
|
||||
"Creator Software",
|
||||
"Creator Major Version",
|
||||
"Creator Minor Version",
|
||||
"Creator Build Number",
|
||||
"Watermark",
|
||||
"Clipping Limit",
|
||||
"Publisher Limit",
|
||||
"Text to Speech Disabled",
|
||||
"CDE Type",
|
||||
"Updated Title",
|
||||
"Font Signature (hex)",
|
||||
"Tamper Proof Keys (hex)",
|
||||
]
|
||||
|
||||
# def handleTag(data, metadata, key, tag, ids={}):
|
||||
def handleTag(data, metadata, key, tag, attrib={}):
|
||||
"""Format metadata values.
|
||||
|
||||
@param data: List of formatted metadata entries.
|
||||
@param metadata: The metadata dictionary.
|
||||
@param key: The key of the metadata value to handle.
|
||||
@param tag: The opf tag corresponds to the metadata value.
|
||||
###@param ids: The ids in tags for refines property of epub3.
|
||||
@param attrib: The extra attributes for refines or opf prefixes.
|
||||
"""
|
||||
if key in metadata:
|
||||
for i, value in enumerate(metadata[key]):
|
||||
closingTag = tag.split(" ")[0]
|
||||
res = "<%s%s>%s</%s>\n" % (
|
||||
tag,
|
||||
attrib.get(i, ""),
|
||||
self.escapeit(value),
|
||||
closingTag,
|
||||
)
|
||||
data.append(res)
|
||||
del metadata[key]
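# e.g. handleTag(data, metadata, "Language", "dc:language") appends '<dc:language>en</dc:language>\n' for a stored value of "en"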
|
||||
|
||||
# these are allowed but ignored by epub3
|
||||
def handleMetaPairs(data, metadata, key, name):
|
||||
if key in metadata:
|
||||
for value in metadata[key]:
|
||||
res = '<meta name="%s" content="%s" />\n' % (
|
||||
name,
|
||||
self.escapeit(value, EXTRA_ENTITIES),
|
||||
)
|
||||
data.append(res)
|
||||
del metadata[key]
|
||||
|
||||
data = []
|
||||
data.append(start_tag + "\n")
|
||||
# Handle standard metadata
|
||||
if "Title" in metadata:
|
||||
handleTag(data, metadata, "Title", "dc:title", self.title_attrib)
|
||||
else:
|
||||
data.append("<dc:title>Untitled</dc:title>\n")
|
||||
handleTag(data, metadata, "Language", "dc:language")
|
||||
if "UniqueID" in metadata:
|
||||
handleTag(data, metadata, "UniqueID", 'dc:identifier id="uid"')
|
||||
else:
|
||||
# No unique ID in original, give it a generic one.
|
||||
data.append('<dc:identifier id="uid">0</dc:identifier>\n')
|
||||
|
||||
if self.target_epubver == "3":
|
||||
# epub version 3 minimal metadata requires a dcterms:modified date tag
|
||||
self.createMetaTag(
|
||||
data,
|
||||
"dcterms:modified",
|
||||
datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
)
|
||||
|
||||
if self.isK8 and has_obfuscated_fonts:
|
||||
# Use the randomly generated urn:uuid so obfuscated fonts work.
|
||||
# It doesn't need to be _THE_ unique identifier to work as a key
|
||||
# for obfuscated fonts in Sigil, ADE and calibre. It just has
|
||||
# to use the opf:scheme="UUID" and have the urn:uuid: prefix.
|
||||
if self.target_epubver == "3":
|
||||
data.append(
|
||||
"<dc:identifier>urn:uuid:" + self.BookId + "</dc:identifier>\n"
|
||||
)
|
||||
else:
|
||||
data.append(
|
||||
'<dc:identifier opf:scheme="UUID">urn:uuid:'
|
||||
+ self.BookId
|
||||
+ "</dc:identifier>\n"
|
||||
)
|
||||
|
||||
handleTag(data, metadata, "Creator", "dc:creator", self.creator_attrib)
|
||||
handleTag(data, metadata, "Contributor", "dc:contributor")
|
||||
handleTag(data, metadata, "Publisher", "dc:publisher", self.publisher_attrib)
|
||||
handleTag(data, metadata, "Source", "dc:source")
|
||||
handleTag(data, metadata, "Type", "dc:type")
|
||||
if self.target_epubver == "3":
|
||||
if "ISBN" in metadata:
|
||||
for i, value in enumerate(metadata["ISBN"]):
|
||||
res = (
|
||||
"<dc:identifier>urn:isbn:%s</dc:identifier>\n"
|
||||
% self.escapeit(value)
|
||||
)
|
||||
data.append(res)
|
||||
else:
|
||||
handleTag(data, metadata, "ISBN", 'dc:identifier opf:scheme="ISBN"')
|
||||
if "Subject" in metadata:
|
||||
if "SubjectCode" in metadata:
|
||||
codeList = metadata["SubjectCode"]
|
||||
del metadata["SubjectCode"]
|
||||
else:
|
||||
codeList = None
|
||||
for i in range(len(metadata["Subject"])):
|
||||
if codeList and i < len(codeList):
|
||||
data.append('<dc:subject BASICCode="' + codeList[i] + '">')
|
||||
else:
|
||||
data.append("<dc:subject>")
|
||||
data.append(self.escapeit(metadata["Subject"][i]) + "</dc:subject>\n")
|
||||
del metadata["Subject"]
|
||||
handleTag(data, metadata, "Description", "dc:description")
|
||||
if self.target_epubver == "3":
|
||||
if "Published" in metadata:
|
||||
for i, value in enumerate(metadata["Published"]):
|
||||
res = "<dc:date>%s</dc:date>\n" % self.escapeit(value)
|
||||
data.append(res)
|
||||
else:
|
||||
handleTag(data, metadata, "Published", 'dc:date opf:event="publication"')
|
||||
handleTag(data, metadata, "Rights", "dc:rights")
|
||||
|
||||
if self.epubver == "F":
|
||||
if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
|
||||
data.append(
|
||||
"<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n"
|
||||
)
|
||||
if self.extra_attributes:
|
||||
data += self.extra_attributes
|
||||
if k8resc is not None and k8resc.extra_attributes:
|
||||
data += k8resc.extra_attributes
|
||||
data.append("-->\n")
|
||||
else:
|
||||
# Append refines metadata.
|
||||
if self.exth_solved_refines_metadata:
|
||||
data.append("<!-- Refines MetaData from EXTH -->\n")
|
||||
data += self.exth_solved_refines_metadata
|
||||
if (
|
||||
self.exth_refines_metadata
|
||||
or k8resc is not None
|
||||
and k8resc.refines_metadata
|
||||
):
|
||||
data.append("<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n")
|
||||
if self.exth_refines_metadata:
|
||||
data += self.exth_refines_metadata
|
||||
if k8resc is not None and k8resc.refines_metadata:
|
||||
data += k8resc.refines_metadata
|
||||
data.append("-->\n")
|
||||
|
||||
# Append metadata in RESC section.
|
||||
if k8resc is not None and k8resc.extra_metadata:
|
||||
data.append("<!-- Extra MetaData from RESC\n")
|
||||
data += k8resc.extra_metadata
|
||||
data.append("-->\n")
|
||||
|
||||
if "CoverOffset" in metadata:
|
||||
imageNumber = int(metadata["CoverOffset"][0])
|
||||
self.covername = self.rscnames[imageNumber]
|
||||
if self.covername is None:
|
||||
logger.debug(
|
||||
"Error: Cover image %s was not recognized as a valid image"
|
||||
% imageNumber
|
||||
)
|
||||
else:
|
||||
# <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
|
||||
data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
|
||||
self.used[self.covername] = "used"
|
||||
del metadata["CoverOffset"]
|
||||
|
||||
handleMetaPairs(data, metadata, "Codec", "output encoding")
|
||||
# handle kindlegen specific tags
|
||||
handleTag(data, metadata, "DictInLanguage", "DictionaryInLanguage")
|
||||
handleTag(data, metadata, "DictOutLanguage", "DictionaryOutLanguage")
|
||||
handleMetaPairs(data, metadata, "RegionMagnification", "RegionMagnification")
|
||||
handleMetaPairs(data, metadata, "book-type", "book-type")
|
||||
handleMetaPairs(data, metadata, "zero-gutter", "zero-gutter")
|
||||
handleMetaPairs(data, metadata, "zero-margin", "zero-margin")
|
||||
handleMetaPairs(data, metadata, "primary-writing-mode", "primary-writing-mode")
|
||||
handleMetaPairs(data, metadata, "fixed-layout", "fixed-layout")
|
||||
handleMetaPairs(data, metadata, "orientation-lock", "orientation-lock")
|
||||
handleMetaPairs(data, metadata, "original-resolution", "original-resolution")
|
||||
|
||||
# these are not allowed in epub2 or 3 so convert them to meta name content pairs
|
||||
# perhaps these could better be mapped into the dcterms namespace instead
|
||||
handleMetaPairs(data, metadata, "Review", "review")
|
||||
handleMetaPairs(data, metadata, "Imprint", "imprint")
|
||||
handleMetaPairs(data, metadata, "Adult", "adult")
|
||||
handleMetaPairs(data, metadata, "DictShortName", "DictionaryVeryShortName")
|
||||
|
||||
# these are needed by Kobo books upon submission, but it is unclear whether they are legal metadata in epub2 or epub3
|
||||
if "Price" in metadata and "Currency" in metadata:
|
||||
priceList = metadata["Price"]
|
||||
currencyList = metadata["Currency"]
|
||||
if len(priceList) != len(currencyList):
|
||||
logger.debug("Error: found %s price entries, but %s currency entries.")
|
||||
else:
|
||||
for i in range(len(priceList)):
|
||||
data.append(
|
||||
'<SRP Currency="'
|
||||
+ currencyList[i]
|
||||
+ '">'
|
||||
+ priceList[i]
|
||||
+ "</SRP>\n"
|
||||
)
|
||||
del metadata["Price"]
|
||||
del metadata["Currency"]
|
||||
|
||||
if self.target_epubver == "3":
|
||||
# Append metadata for EPUB3.
|
||||
if self.exth_fixedlayout_metadata:
|
||||
data.append("<!-- EPUB3 MedaData converted from EXTH -->\n")
|
||||
data += self.exth_fixedlayout_metadata
|
||||
|
||||
# all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
|
||||
# so it can not impact anything and will be automatically stripped out if found again in a RESC section
|
||||
data.append(BEGIN_INFO_ONLY + "\n")
|
||||
if "ThumbOffset" in metadata:
|
||||
imageNumber = int(metadata["ThumbOffset"][0])
|
||||
imageName = self.rscnames[imageNumber]
|
||||
if imageName is None:
|
||||
logger.debug(
|
||||
"Error: Cover Thumbnail image %s was not recognized as a valid image"
|
||||
% imageNumber
|
||||
)
|
||||
else:
|
||||
data.append(
|
||||
'<meta name="Cover ThumbNail Image" content="'
|
||||
+ "Images/"
|
||||
+ imageName
|
||||
+ '" />\n'
|
||||
)
|
||||
# self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
|
||||
self.used[imageName] = "not used"
|
||||
del metadata["ThumbOffset"]
|
||||
for metaName in META_TAGS:
|
||||
if metaName in metadata:
|
||||
for value in metadata[metaName]:
|
||||
data.append(
|
||||
'<meta name="'
|
||||
+ metaName
|
||||
+ '" content="'
|
||||
+ self.escapeit(value, EXTRA_ENTITIES)
|
||||
+ '" />\n'
|
||||
)
|
||||
del metadata[metaName]
|
||||
for key in list(metadata.keys()):
|
||||
for value in metadata[key]:
|
||||
data.append(
|
||||
'<meta name="'
|
||||
+ key
|
||||
+ '" content="'
|
||||
+ self.escapeit(value, EXTRA_ENTITIES)
|
||||
+ '" />\n'
|
||||
)
|
||||
del metadata[key]
|
||||
data.append(END_INFO_ONLY + "\n")
|
||||
data.append("</metadata>\n")
|
||||
return data
|
||||
|
||||
def buildOPFManifest(self, ncxname, navname=None):
|
||||
# buildManifest for mobi7, azw4, epub2 and epub3.
|
||||
k8resc = self.k8resc
|
||||
cover_id = self.cover_id
|
||||
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||
self.ncxname = ncxname
|
||||
self.navname = navname
|
||||
|
||||
data = []
|
||||
data.append("<manifest>\n")
|
||||
media_map = {
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".svg": "image/svg+xml",
|
||||
".xhtml": "application/xhtml+xml",
|
||||
".html": "text/html", # for mobi7
|
||||
".pdf": "application/pdf", # for azw4(print replica textbook)
|
||||
".ttf": "application/x-font-ttf",
|
||||
".otf": "application/x-font-opentype", # replaced?
|
||||
".css": "text/css",
|
||||
# '.html' : 'text/x-oeb1-document', # for mobi7
|
||||
# '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
|
||||
# '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
|
||||
# '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
|
||||
# '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
|
||||
# '.mp3' : 'audio/mpeg',
|
||||
# '.mp4' : 'video/mp4',
|
||||
# '.js' : 'text/javascript', # not supported in K8
|
||||
}
|
||||
spinerefs = []
|
||||
|
||||
idcnt = 0
|
||||
for [key, dir, fname] in self.fileinfo:
|
||||
name, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
media = media_map.get(ext)
|
||||
ref = "item%d" % idcnt
|
||||
if hasK8RescSpine:
|
||||
if key is not None and key in k8resc.spine_idrefs:
|
||||
ref = k8resc.spine_idrefs[key]
|
||||
properties = ""
|
||||
if dir != "":
|
||||
fpath = dir + "/" + fname
|
||||
else:
|
||||
fpath = fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
|
||||
if ext in [".xhtml", ".html"]:
|
||||
spinerefs.append(ref)
|
||||
idcnt += 1
|
||||
|
||||
for fname in self.rscnames:
|
||||
if fname is not None:
|
||||
if self.used.get(fname, "not used") == "not used":
|
||||
continue
|
||||
name, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
media = media_map.get(ext, ext[1:])
|
||||
properties = ""
|
||||
if fname == self.covername:
|
||||
ref = cover_id
|
||||
if self.target_epubver == "3":
|
||||
properties = 'properties="cover-image"'
|
||||
else:
|
||||
ref = "item%d" % idcnt
|
||||
if ext == ".ttf" or ext == ".otf":
|
||||
if self.isK8: # fonts are only used in Mobi 8
|
||||
fpath = "Fonts/" + fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
else:
|
||||
fpath = "Images/" + fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
idcnt += 1
|
||||
|
||||
if self.target_epubver == "3" and navname is not None:
|
||||
data.append(
|
||||
'<item id="nav" media-type="application/xhtml+xml" href="Text/'
|
||||
+ navname
|
||||
+ '" properties="nav"/>\n'
|
||||
)
|
||||
if self.has_ncx and ncxname is not None:
|
||||
data.append(
|
||||
'<item id="ncx" media-type="application/x-dtbncx+xml" href="'
|
||||
+ ncxname
|
||||
+ '" />\n'
|
||||
)
|
||||
if self.pagemap != "":
|
||||
data.append(
|
||||
'<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n'
|
||||
)
|
||||
data.append("</manifest>\n")
|
||||
return [data, spinerefs]
|
||||
|
||||
def buildOPFSpine(self, spinerefs, isNCX):
|
||||
# build spine
|
||||
k8resc = self.k8resc
|
||||
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||
data = []
|
||||
ppd = ""
|
||||
if self.isK8 and self.page_progression_direction is not None:
|
||||
ppd = ' page-progression-direction="{:s}"'.format(
|
||||
self.page_progression_direction
|
||||
)
|
||||
ncx = ""
|
||||
if isNCX:
|
||||
ncx = ' toc="ncx"'
|
||||
map = ""
|
||||
if self.pagemap != "":
|
||||
map = ' page-map="map"'
|
||||
if self.epubver == "F":
|
||||
if ppd:
|
||||
ppd = "<!--" + ppd + " -->"
|
||||
spine_start_tag = "<spine{1:s}{2:s}>{0:s}\n".format(ppd, map, ncx)
|
||||
else:
|
||||
spine_start_tag = "<spine{0:s}{1:s}{2:s}>\n".format(ppd, map, ncx)
|
||||
data.append(spine_start_tag)
|
||||
|
||||
if hasK8RescSpine:
|
||||
for key in k8resc.spine_order:
|
||||
idref = k8resc.spine_idrefs[key]
|
||||
attribs = k8resc.spine_pageattributes[key]
|
||||
tag = '<itemref idref="%s"' % idref
|
||||
for aname, val in list(attribs.items()):
|
||||
if self.epubver == "F" and aname == "properties":
|
||||
continue
|
||||
if val is not None:
|
||||
tag += ' %s="%s"' % (aname, val)
|
||||
tag += "/>"
|
||||
if self.epubver == "F" and "properties" in attribs:
|
||||
val = attribs["properties"]
|
||||
if val is not None:
|
||||
tag += '<!-- properties="%s" -->' % val
|
||||
tag += "\n"
|
||||
data.append(tag)
|
||||
else:
|
||||
start = 0
|
||||
# special case the created coverpage if need be
|
||||
[key, dir, fname] = self.fileinfo[0]
|
||||
if key is not None and key == "coverpage":
|
||||
entry = spinerefs[start]
|
||||
data.append('<itemref idref="%s" linear="no"/>\n' % entry)
|
||||
start += 1
|
||||
for entry in spinerefs[start:]:
|
||||
data.append('<itemref idref="' + entry + '"/>\n')
|
||||
data.append("</spine>\n")
|
||||
return data
|
||||
|
||||
def buildMobi7OPF(self):
|
||||
# Build an OPF for mobi7 and azw4.
|
||||
logger.debug("Building an opf for mobi7/azw4.")
|
||||
data = []
|
||||
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||
data.append(
|
||||
'<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||
)
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||
opf_metadata = self.buildOPFMetadata(metadata_tag)
|
||||
data += opf_metadata
|
||||
if self.has_ncx:
|
||||
# ncxname = self.files.getInputFileBasename() + '.ncx'
|
||||
ncxname = "toc.ncx"
|
||||
else:
|
||||
ncxname = None
|
||||
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
|
||||
data += opf_manifest
|
||||
opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
|
||||
data += opf_spine
|
||||
data.append("<tours>\n</tours>\n")
|
||||
if not self.printReplica:
|
||||
guide = "<guide>\n" + self.guidetext + "</guide>\n"
|
||||
data.append(guide)
|
||||
data.append("</package>\n")
|
||||
return "".join(data)
|
||||
|
||||
def buildEPUBOPF(self, has_obfuscated_fonts=False):
|
||||
logger.debug(
|
||||
"Building an opf for mobi8 using epub version: %s" % self.target_epubver
|
||||
)
|
||||
if self.target_epubver == "2":
|
||||
has_ncx = self.has_ncx
|
||||
has_guide = True
|
||||
ncxname = TOC_NCX
|
||||
navname = None
|
||||
package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||
tours = "<tours>\n</tours>\n"
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||
else:
|
||||
has_ncx = EPUB3_WITH_NCX
|
||||
has_guide = EPUB3_WITH_GUIDE
|
||||
ncxname = None
|
||||
if has_ncx:
|
||||
ncxname = TOC_NCX
|
||||
navname = NAVIGATION_DOCUMENT
|
||||
package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
|
||||
tours = ""
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
|
||||
|
||||
data = []
|
||||
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||
data.append(package)
|
||||
opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
|
||||
data += opf_metadata
|
||||
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
|
||||
data += opf_manifest
|
||||
opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
|
||||
data += opf_spine
|
||||
data.append(tours)
|
||||
if has_guide:
|
||||
guide = "<guide>\n" + self.guidetext + "</guide>\n"
|
||||
data.append(guide)
|
||||
data.append("</package>\n")
|
||||
return "".join(data)
|
||||
|
||||
def writeOPF(self, has_obfuscated_fonts=False):
|
||||
if self.isK8:
|
||||
data = self.buildEPUBOPF(has_obfuscated_fonts)
|
||||
outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
|
||||
with open(pathof(outopf), "wb") as f:
|
||||
f.write(data.encode("utf-8"))
|
||||
return self.BookId
|
||||
else:
|
||||
data = self.buildMobi7OPF()
|
||||
outopf = os.path.join(self.files.mobi7dir, "content.opf")
|
||||
with open(pathof(outopf), "wb") as f:
|
||||
f.write(data.encode("utf-8"))
|
||||
return 0
|
||||
|
||||
def getBookId(self):
|
||||
return self.BookId
|
||||
|
||||
def getNCXName(self):
|
||||
return self.ncxname
|
||||
|
||||
def getNAVName(self):
|
||||
return self.navname
|
||||
|
||||
def getEPUBVersion(self):
|
||||
return self.target_epubver
|
||||
|
||||
def hasNCX(self):
|
||||
return self.ncxname is not None and self.has_ncx
|
||||
|
||||
def hasNAV(self):
|
||||
return self.navname is not None
|
||||
|
||||
def autodetectEPUBVersion(self):
|
||||
# Determine EPUB version from metadata and RESC.
|
||||
metadata = self.metadata
|
||||
k8resc = self.k8resc
|
||||
epubver = "2"
|
||||
if "true" == metadata.get("fixed-layout", [""])[0].lower():
|
||||
epubver = "3"
|
||||
elif metadata.get("orientation-lock", [""])[0].lower() in [
|
||||
"portrait",
|
||||
"landscape",
|
||||
]:
|
||||
epubver = "3"
|
||||
elif self.page_progression_direction == "rtl":
|
||||
epubver = "3"
|
||||
elif EXTH_TITLE_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif EXTH_CREATOR_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif EXTH_PUBLISHER_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif k8resc is not None and k8resc.needEPUB3():
|
||||
epubver = "3"
|
||||
return epubver
|
||||
|
||||
def defineRefinesID(self):
|
||||
# the following EXTH are set by KDP.
|
||||
# 'Title_Furigana_(508)'
|
||||
# 'Creator_Furigana_(517)',
|
||||
# 'Publisher_Furigana_(522)'
|
||||
# It is difficult to find correspondence between Title, Creator, Publisher
|
||||
# and EXTH 508, 517, and 522 if they have more than two values, since KDP does not seem to preserve the order of EXTH 508, 517, and 522.
|
||||
# It is also difficult to find correspondence between them and tags which have refine attributes in RESC.
|
||||
# So editing manually is required.
|
||||
metadata = self.metadata
|
||||
|
||||
needRefinesId = False
|
||||
if self.k8resc is not None:
|
||||
needRefinesId = self.k8resc.hasRefines()
|
||||
# Create ids for refines attributes
|
||||
if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and "Title" in metadata:
|
||||
for i in range(len(metadata.get("Title"))):
|
||||
self.title_id[i] = "title%02d" % (i + 1)
|
||||
|
||||
if (
|
||||
needRefinesId or EXTH_CREATOR_FURIGANA in metadata
|
||||
) and "Creator" in metadata:
|
||||
for i in range(len(metadata.get("Creator"))):
|
||||
self.creator_id[i] = "creator%02d" % (i + 1)
|
||||
|
||||
if (
|
||||
needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata
|
||||
) and "Publisher" in metadata:
|
||||
for i in range(len(metadata.get("Publisher"))):
|
||||
self.publisher_id[i] = "publisher%02d" % (i + 1)
|
||||
|
||||
def processRefinesMetadata(self):
|
||||
# create refines metadata defined in epub3 or convert the refines property to opf: attributes for epub2.
|
||||
metadata = self.metadata
|
||||
|
||||
refines_list = [
|
||||
[EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, "title00"],
|
||||
[EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, "creator00"],
|
||||
[
|
||||
EXTH_PUBLISHER_FURIGANA,
|
||||
self.publisher_id,
|
||||
self.publisher_attrib,
|
||||
"publisher00",
|
||||
],
|
||||
]
|
||||
|
||||
create_refines_metadata = False
|
||||
for EXTH in lzip(*refines_list)[0]:
|
||||
if EXTH in metadata:
|
||||
create_refines_metadata = True
|
||||
break
|
||||
if create_refines_metadata:
|
||||
for [EXTH, id, attrib, defaultid] in refines_list:
|
||||
if self.target_epubver == "3":
|
||||
for i, value in list(id.items()):
|
||||
attrib[i] = ' id="%s"' % value
|
||||
|
||||
if EXTH in metadata:
|
||||
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||
self.createMetaTag(
|
||||
self.exth_solved_refines_metadata,
|
||||
"file-as",
|
||||
metadata[EXTH][0],
|
||||
id[0],
|
||||
)
|
||||
else:
|
||||
for i, value in enumerate(metadata[EXTH]):
|
||||
self.createMetaTag(
|
||||
self.exth_refines_metadata,
|
||||
"file-as",
|
||||
value,
|
||||
id.get(i, defaultid),
|
||||
)
|
||||
else:
|
||||
if EXTH in metadata:
|
||||
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||
attr = ' opf:file-as="%s"' % metadata[EXTH][0]
|
||||
attrib[0] = attr
|
||||
else:
|
||||
for i, value in enumerate(metadata[EXTH]):
|
||||
attr = ' id="#%s" opf:file-as="%s"\n' % (
|
||||
id.get(i, defaultid),
|
||||
value,
|
||||
)
|
||||
self.extra_attributes.append(attr)
|
||||
|
||||
def createMetadataForFixedlayout(self):
|
||||
# convert fixed layout to epub3 format if needed.
|
||||
metadata = self.metadata
|
||||
|
||||
if "fixed-layout" in metadata:
|
||||
fixedlayout = metadata["fixed-layout"][0]
|
||||
content = {"true": "pre-paginated"}.get(fixedlayout.lower(), "reflowable")
|
||||
self.createMetaTag(
|
||||
self.exth_fixedlayout_metadata, "rendition:layout", content
|
||||
)
|
||||
|
||||
if "orientation-lock" in metadata:
|
||||
content = metadata["orientation-lock"][0].lower()
|
||||
if content == "portrait" or content == "landscape":
|
||||
self.createMetaTag(
|
||||
self.exth_fixedlayout_metadata, "rendition:orientation", content
|
||||
)
|
||||
|
||||
# according to epub3 spec about correspondence with Amazon
|
||||
# if 'original-resolution' is provided it needs to be converted to
|
||||
# meta viewport property tag stored in the <head></head> of **each**
|
||||
# xhtml page - so this tag would need to be handled by editing each part
|
||||
# before reaching this routine
|
||||
# we need to add support for this to the k8html routine
|
||||
# if 'original-resolution' in metadata.keys():
|
||||
# resolution = metadata['original-resolution'][0].lower()
|
||||
# width, height = resolution.split('x')
|
||||
# if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
|
||||
# viewport = 'width=%s, height=%s' % (width, height)
|
||||
# self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
|
||||
185
mobiparse/mobi/mobi_pagemap.py
Executable file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, unicode_str
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
|
||||
_TABLE = [
|
||||
("m", 1000),
|
||||
("cm", 900),
|
||||
("d", 500),
|
||||
("cd", 400),
|
||||
("c", 100),
|
||||
("xc", 90),
|
||||
("l", 50),
|
||||
("xl", 40),
|
||||
("x", 10),
|
||||
("ix", 9),
|
||||
("v", 5),
|
||||
("iv", 4),
|
||||
("i", 1),
|
||||
]
|
||||
|
||||
|
||||
def int_to_roman(i):
|
||||
parts = []
|
||||
num = i
|
||||
for letter, value in _TABLE:
|
||||
while value <= num:
|
||||
num -= value
|
||||
parts.append(letter)
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
def roman_to_int(s):
|
||||
result = 0
|
||||
rnstr = s
|
||||
for letter, value in _TABLE:
|
||||
while rnstr.startswith(letter):
|
||||
result += value
|
||||
rnstr = rnstr[len(letter) :]
|
||||
return result
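# example: roman_to_int("xiv") returns 14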
|
||||
|
||||
|
||||
_pattern = r"""\(([^\)]*)\)"""
|
||||
_tup_pattern = re.compile(_pattern, re.IGNORECASE)
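# matches the parenthesised page-name tuples embedded in the page map string, e.g. "(1,a,1)" or "(5,r,i)"; _parseNames below splits each tuple into (position, type, value)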
|
||||
|
||||
|
||||
def _parseNames(numpages, data):
|
||||
data = unicode_str(data)
|
||||
pagenames = []
|
||||
pageMap = ""
|
||||
for i in range(numpages):
|
||||
pagenames.append(None)
|
||||
for m in re.finditer(_tup_pattern, data):
|
||||
tup = m.group(1)
|
||||
if pageMap != "":
|
||||
pageMap += ","
|
||||
pageMap += "(" + tup + ")"
|
||||
spos, nametype, svalue = tup.split(",")
|
||||
# print(spos, nametype, svalue)
|
||||
if nametype == "a" or nametype == "r":
|
||||
svalue = int(svalue)
|
||||
spos = int(spos)
|
||||
for i in range(spos - 1, numpages):
|
||||
if nametype == "r":
|
||||
pname = int_to_roman(svalue)
|
||||
svalue += 1
|
||||
elif nametype == "a":
|
||||
pname = "%s" % svalue
|
||||
svalue += 1
|
||||
elif nametype == "c":
|
||||
sp = svalue.find("|")
|
||||
if sp == -1:
|
||||
pname = svalue
|
||||
else:
|
||||
pname = svalue[0:sp]
|
||||
svalue = svalue[sp + 1 :]
|
||||
else:
|
||||
logger.debug("Error: unknown page numbering type %s" % nametype)
|
||||
pagenames[i] = pname
|
||||
return pagenames, pageMap
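# example: _parseNames(3, "(1,a,1)") returns (["1", "2", "3"], "(1,a,1)")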
|
||||
|
||||
|
||||
class PageMapProcessor:
|
||||
def __init__(self, mh, data):
|
||||
self.mh = mh
|
||||
self.data = data
|
||||
self.pagenames = []
|
||||
self.pageoffsets = []
|
||||
self.pageMap = ""
|
||||
self.pm_len = 0
|
||||
self.pm_nn = 0
|
||||
self.pn_bits = 0
|
||||
self.pmoff = None
|
||||
self.pmstr = ""
|
||||
logger.debug("Extracting Page Map Information")
|
||||
(rev_len,) = struct.unpack_from(b">L", self.data, 0x10)
|
||||
# skip over header, revision string length data, and revision string
|
||||
ptr = 0x14 + rev_len
|
||||
pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(
|
||||
b">4H", self.data, ptr
|
||||
)
|
||||
# print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
|
||||
self.pmstr = self.data[ptr + 8 : ptr + 8 + self.pm_len]
|
||||
self.pmoff = self.data[ptr + 8 + self.pm_len :]
|
||||
offsize = b">L"
|
||||
offwidth = 4
|
||||
if self.pm_bits == 16:
|
||||
offsize = b">H"
|
||||
offwidth = 2
|
||||
ptr = 0
|
||||
for i in range(self.pm_nn):
|
||||
(od,) = struct.unpack_from(offsize, self.pmoff, ptr)
|
||||
ptr += offwidth
|
||||
self.pageoffsets.append(od)
|
||||
self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
|
||||
|
||||
def getPageMap(self):
|
||||
return self.pageMap
|
||||
|
||||
def getNames(self):
|
||||
return self.pagenames
|
||||
|
||||
def getOffsets(self):
|
||||
return self.pageoffsets
|
||||
|
||||
# page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
|
||||
def generateKF8PageMapXML(self, k8proc):
|
||||
pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
|
||||
for i in range(len(self.pagenames)):
|
||||
pos = self.pageoffsets[i]
|
||||
name = self.pagenames[i]
|
||||
if name is not None and name != "":
|
||||
[pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
|
||||
idtext = unicode_str(k8proc.getPageIDTag(pos))
|
||||
linktgt = unicode_str(filename)
|
||||
if idtext != "":
|
||||
linktgt += "#" + idtext
|
||||
pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
|
||||
pagemapxml += "</page-map>\n"
|
||||
return pagemapxml
|
||||
|
||||
def generateAPNX(self, apnx_meta):
|
||||
if apnx_meta["format"] == "MOBI_8":
|
||||
content_header = (
|
||||
'{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}'
|
||||
% apnx_meta
|
||||
)
|
||||
else:
|
||||
content_header = (
|
||||
'{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}'
|
||||
% apnx_meta
|
||||
)
|
||||
content_header = content_header.encode("utf-8")
|
||||
page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
|
||||
page_header = page_header.encode("utf-8")
|
||||
apnx = struct.pack(b">H", 1) + struct.pack(b">H", 1)
|
||||
apnx += struct.pack(b">I", 12 + len(content_header))
|
||||
apnx += struct.pack(b">I", len(content_header))
|
||||
apnx += content_header
|
||||
apnx += struct.pack(b">H", 1)
|
||||
apnx += struct.pack(b">H", len(page_header))
|
||||
apnx += struct.pack(b">H", self.pm_nn)
|
||||
apnx += struct.pack(b">H", 32)
|
||||
apnx += page_header
|
||||
for page in self.pageoffsets:
|
||||
apnx += struct.pack(b">L", page)
|
||||
return apnx
|
||||
204
mobiparse/mobi/mobi_sectioner.py
Executable file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
|
||||
from loguru import logger
|
||||
|
||||
import datetime
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
import struct
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
DUMP = False
|
||||
""" Set to True to dump all possible information. """
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def describe(data):
|
||||
txtans = ""
|
||||
hexans = hexlify(data)
|
||||
for i in data:
|
||||
if bord(i) < 32 or bord(i) > 127:
|
||||
txtans += "?"
|
||||
else:
|
||||
txtans += bchar(i).decode("latin-1")
|
||||
return '"' + txtans + '"' + " 0x" + hexans
|
||||
|
||||
|
||||
def datetimefrompalmtime(palmtime):
|
||||
if palmtime > 0x7FFFFFFF:
|
||||
pythondatetime = datetime.datetime(
|
||||
year=1904, month=1, day=1
|
||||
) + datetime.timedelta(seconds=palmtime)
|
||||
else:
|
||||
pythondatetime = datetime.datetime(
|
||||
year=1970, month=1, day=1
|
||||
) + datetime.timedelta(seconds=palmtime)
|
||||
return pythondatetime
|
||||
|
||||
|
||||
class Sectionizer:
|
||||
def __init__(self, filename):
|
||||
self.data = b""
|
||||
with open(pathof(filename), "rb") as f:
|
||||
self.data = f.read()
|
||||
self.palmheader = self.data[:78]
|
||||
self.palmname = self.data[:32]
|
||||
self.ident = self.palmheader[0x3C : 0x3C + 8]
|
||||
# CG struct.unpack_from(fmt, buffer, offset=0)
|
||||
(self.num_sections,) = struct.unpack_from(b">H", self.palmheader, 76)
|
||||
self.filelength = len(self.data)
|
||||
|
||||
## CGDBG ???
|
||||
## sectionsdata (9680, 0, 18618, 2, 22275, 4, 25504, 6, 28607, 8,...
|
||||
sectionsdata = struct.unpack_from(bstr(">%dL" % (self.num_sections * 2)), self.data, 78) + (self.filelength, 0)
|
||||
|
||||
## offsets and lengths of all sections
|
||||
# sectionsoffset (9680, 18618, 22275, 25504, 28607, ...
|
||||
self.sectionoffsets = sectionsdata[::2]
|
||||
# sectionattributes (0, 2, 4, 6, 8, ...
|
||||
self.sectionattributes = sectionsdata[1::2]
|
||||
self.sectiondescriptions = ["" for x in range(self.num_sections + 1)]
|
||||
self.sectiondescriptions[-1] = "File Length Only"
|
||||
|
||||
# CGDBG what does unpack_from return? a tuple (,)
|
||||
print( 'sectionsdata {} {}'.format(sectionsdata, bstr(">%dL" % (self.num_sections * 2))))
|
||||
print( 'sectionsoffset {} \n sectionattributes {}'.format( self.sectionoffsets, self.sectionattributes ))
|
||||
print( 'sectionsdescriptions {} '.format( self.sectiondescriptions))
|
||||
print( bstr(">%dL" % (self.num_sections * 2) ) )
|
||||
print( struct.unpack_from(bstr(">%dL" % (self.num_sections * 2)) , self.data, 78) )
|
||||
print( (self.filelength, 0) )
|
||||
|
||||
return
|
||||
|
||||
# sections information
|
||||
def dumpsectionsinfo(self):
|
||||
logger.debug("Section Offset Length UID Attribs Description")
|
||||
for i in range(self.num_sections):
|
||||
'''
|
||||
logger.debug(
|
||||
"{} {} {} {} {} {} {}\n".format( i, i,
|
||||
self.sectionoffsets[i],
|
||||
self.sectionoffsets[i + 1] - self.sectionoffsets[i],
|
||||
self.sectionattributes[i] & 0xFFFFFF,
|
||||
(self.sectionattributes[i] >> 24) & 0xFF,
|
||||
self.sectiondescriptions[i]))
|
||||
'''
|
||||
logger.debug(
|
||||
"%3d %3X 0x%07X 0x%05X % 8d % 7d %s"
|
||||
% (
|
||||
i,
|
||||
i,
|
||||
self.sectionoffsets[i],
|
||||
self.sectionoffsets[i + 1] - self.sectionoffsets[i],
|
||||
self.sectionattributes[i] & 0xFFFFFF,
|
||||
(self.sectionattributes[i] >> 24) & 0xFF,
|
||||
self.sectiondescriptions[i],
|
||||
)
|
||||
)
|
||||
logger.debug(
|
||||
"%3d %3X 0x%07X %s"
|
||||
% (
|
||||
self.num_sections,
|
||||
self.num_sections,
|
||||
self.sectionoffsets[self.num_sections],
|
||||
self.sectiondescriptions[self.num_sections],
|
||||
)
|
||||
)
|
||||
|
||||
def setsectiondescription(self, section, description):
|
||||
if section < len(self.sectiondescriptions):
|
||||
self.sectiondescriptions[section] = description
|
||||
else:
|
||||
logger.debug(
|
||||
"Section out of range: %d, description %s" % (section, description)
|
||||
)
|
||||
|
||||
def dumppalmheader(self):
|
||||
logger.debug("Palm Database Header")
|
||||
logger.debug("Database name: " + repr(self.palmheader[:32]))
|
||||
(dbattributes,) = struct.unpack_from(b">H", self.palmheader, 32)
|
||||
logger.debug("Bitfield attributes: 0x%0X" % dbattributes,)
|
||||
if dbattributes != 0:
|
||||
print(" (",)
|
||||
if dbattributes & 2:
|
||||
print("Read-only; ",)
|
||||
if dbattributes & 4:
|
||||
print("Dirty AppInfoArea; ",)
|
||||
if dbattributes & 8:
|
||||
print("Needs to be backed up; ",)
|
||||
if dbattributes & 16:
|
||||
print("OK to install over newer; ",)
|
||||
if dbattributes & 32:
|
||||
print("Reset after installation; ",)
|
||||
if dbattributes & 64:
|
||||
print("No copying by PalmPilot beaming; ",)
|
||||
print(")")
|
||||
else:
|
||||
print("")
|
||||
logger.debug(
|
||||
"File version: %d" % struct.unpack_from(b">H", self.palmheader, 34)[0]
|
||||
)
|
||||
(dbcreation,) = struct.unpack_from(b">L", self.palmheader, 36)
|
||||
logger.debug(
|
||||
"Creation Date: "
|
||||
+ str(datetimefrompalmtime(dbcreation))
|
||||
+ (" (0x%0X)" % dbcreation)
|
||||
)
|
||||
(dbmodification,) = struct.unpack_from(b">L", self.palmheader, 40)
|
||||
logger.debug(
|
||||
"Modification Date: "
|
||||
+ str(datetimefrompalmtime(dbmodification))
|
||||
+ (" (0x%0X)" % dbmodification)
|
||||
)
|
||||
(dbbackup,) = struct.unpack_from(b">L", self.palmheader, 44)
|
||||
if dbbackup != 0:
|
||||
logger.debug(
|
||||
"Backup Date: "
|
||||
+ str(datetimefrompalmtime(dbbackup))
|
||||
+ (" (0x%0X)" % dbbackup)
|
||||
)
|
||||
logger.debug(
|
||||
"Modification No.: %d" % struct.unpack_from(b">L", self.palmheader, 48)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"App Info offset: 0x%0X" % struct.unpack_from(b">L", self.palmheader, 52)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Sort Info offset: 0x%0X"
|
||||
% struct.unpack_from(b">L", self.palmheader, 56)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Type/Creator: %s/%s"
|
||||
% (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))
|
||||
)
|
||||
logger.debug(
|
||||
"Unique seed: 0x%0X" % struct.unpack_from(b">L", self.palmheader, 68)[0]
|
||||
)
|
||||
(expectedzero,) = struct.unpack_from(b">L", self.palmheader, 72)
|
||||
if expectedzero != 0:
|
||||
logger.debug(
|
||||
"Should be zero but isn't: %d"
|
||||
% struct.unpack_from(b">L", self.palmheader, 72)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Number of sections: %d" % struct.unpack_from(b">H", self.palmheader, 76)[0]
|
||||
)
|
||||
return
|
||||
|
||||
def loadSection(self, section):
|
||||
before, after = self.sectionoffsets[section : section + 2]
|
||||
return self.data[before:after]
|
||||
505
mobiparse/mobi/mobi_split.py
Executable file
@@ -0,0 +1,505 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
from loguru import logger
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
|
||||
# CG : reference https://wiki.mobileread.com/wiki/MOBI
|
||||
# important pdb header offsets
|
||||
unique_id_seed = 68
|
||||
number_of_pdb_records = 76
|
||||
|
||||
# important palmdoc header offsets
|
||||
book_length = 4
|
||||
book_record_count = 8
|
||||
first_pdb_record = 78
|
||||
|
||||
# important rec0 offsets
|
||||
length_of_book = 4
|
||||
mobi_header_base = 16
|
||||
mobi_header_length = 20
|
||||
mobi_type = 24
|
||||
mobi_version = 36
|
||||
first_non_text = 80
|
||||
title_offset = 84
|
||||
first_resc_record = 108
|
||||
first_content_index = 192
|
||||
last_content_index = 194
|
||||
kf8_fdst_index = 192 # for KF8 mobi headers
|
||||
fcis_index = 200
|
||||
flis_index = 208
|
||||
srcs_index = 224
|
||||
srcs_count = 228
|
||||
primary_index = 244
|
||||
datp_index = 256
|
||||
huffoff = 112
|
||||
hufftbloff = 120
|
||||
|
||||
|
||||
def getint(datain, ofs, sz=b"L"):
|
||||
(i,) = struct.unpack_from(b">" + sz, datain, ofs)
|
||||
return i
|
||||
|
||||
|
||||
def writeint(datain, ofs, n, len=b"L"):
|
||||
if len == b"L":
|
||||
return datain[:ofs] + struct.pack(b">L", n) + datain[ofs + 4 :]
|
||||
else:
|
||||
return datain[:ofs] + struct.pack(b">H", n) + datain[ofs + 2 :]
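# returns a new bytestring with a 4-byte (b"L") or 2-byte (b"H") big-endian value spliced in at ofs; the input is not modified in place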
|
||||
|
||||
|
||||
def getsecaddr(datain, secno):
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
assert 0 <= secno < nsec, "secno %d out of range (nsec=%d)" % (secno, nsec)
|
||||
secstart = getint(datain, first_pdb_record + secno * 8)
|
||||
if secno == nsec - 1:
|
||||
secend = len(datain)
|
||||
else:
|
||||
secend = getint(datain, first_pdb_record + (secno + 1) * 8)
|
||||
return secstart, secend
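# section boundaries come from the PDB record table starting at first_pdb_record; the last section runs to the end of the file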
|
||||
|
||||
|
||||
def readsection(datain, secno):
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
return datain[secstart:secend]
|
||||
|
||||
|
||||
def writesection(datain, secno, secdata): # overwrite, accounting for different length
|
||||
# dataout = deletesectionrange(datain,secno, secno)
|
||||
# return insertsection(dataout, secno, secdata)
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
dif = len(secdata) - (secend - secstart)
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * nsec + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec))
|
||||
newstart = zerosecstart
|
||||
for i in range(0, secno):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
datalst.append(struct.pack(b">L", secstart) + struct.pack(b">L", (2 * secno)))
|
||||
for i in range(secno + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs + dif
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * nsec)
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(secdata)
|
||||
datalst.append(datain[secend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def nullsection(datain, secno): # make it zero-length without deleting it
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = secend - secstart
|
||||
datalst.append(datain[:first_pdb_record])
|
||||
for i in range(0, secno + 1):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
for i in range(secno + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - dif
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = zerosecstart - (first_pdb_record + 8 * nsec)
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(datain[secend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def deletesectionrange(datain, firstsec, lastsec): # delete a range of sections
|
||||
datalst = []
|
||||
firstsecstart, firstsecend = getsecaddr(datain, firstsec)
|
||||
lastsecstart, lastsecend = getsecaddr(datain, lastsec)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = lastsecend - firstsecstart + 8 * (lastsec - firstsec + 1)
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec - (lastsec - firstsec + 1)) + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec - (lastsec - firstsec + 1)))
|
||||
newstart = zerosecstart - 8 * (lastsec - firstsec + 1)
|
||||
for i in range(0, firstsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - 8 * (lastsec - firstsec + 1)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
for i in range(lastsec + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - dif
|
||||
flgval = 2 * (i - (lastsec - firstsec + 1))
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec - (lastsec - firstsec + 1)))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:firstsecstart])
|
||||
datalst.append(datain[lastsecend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def insertsection(datain, secno, secdata): # insert a new section
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
# print("inserting secno" , secno, "into" ,nsec, "sections")
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = len(secdata)
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec + 1) + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec + 1))
|
||||
newstart = zerosecstart + 8
|
||||
for i in range(0, secno):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs += 8
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
datalst.append(struct.pack(b">L", secstart + 8) + struct.pack(b">L", (2 * secno)))
|
||||
for i in range(secno, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs + dif + 8
|
||||
flgval = 2 * (i + 1)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec + 1))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(secdata)
|
||||
datalst.append(datain[secstart:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def insertsectionrange(
|
||||
sectionsource, firstsec, lastsec, sectiontarget, targetsec
|
||||
): # insert a range of sections
|
||||
# print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
|
||||
# dataout = sectiontarget
|
||||
# for idx in range(lastsec,firstsec-1,-1):
|
||||
# dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
|
||||
# return dataout
|
||||
datalst = []
|
||||
nsec = getint(sectiontarget, number_of_pdb_records, b"H")
|
||||
zerosecstart, zerosecend = getsecaddr(sectiontarget, 0)
|
||||
insstart, nul = getsecaddr(sectiontarget, targetsec)
|
||||
nins = lastsec - firstsec + 1
|
||||
srcstart, nul = getsecaddr(sectionsource, firstsec)
|
||||
nul, srcend = getsecaddr(sectionsource, lastsec)
|
||||
newstart = zerosecstart + 8 * nins
|
||||
|
||||
datalst.append(sectiontarget[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec + nins) + 1))
|
||||
datalst.append(sectiontarget[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec + nins))
|
||||
for i in range(0, targetsec):
|
||||
ofs, flgval = struct.unpack_from(
|
||||
b">2L", sectiontarget, first_pdb_record + i * 8
|
||||
)
|
||||
ofsnew = ofs + 8 * nins
|
||||
flgvalnew = flgval
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||
srcstart0, nul = getsecaddr(sectionsource, firstsec)
|
||||
for i in range(nins):
|
||||
isrcstart, nul = getsecaddr(sectionsource, firstsec + i)
|
||||
ofsnew = insstart + (isrcstart - srcstart0) + 8 * nins
|
||||
flgvalnew = 2 * (targetsec + i)
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew)
|
||||
dif = srcend - srcstart
|
||||
for i in range(targetsec, nsec):
|
||||
ofs, flgval = struct.unpack_from(
|
||||
b">2L", sectiontarget, first_pdb_record + i * 8
|
||||
)
|
||||
ofsnew = ofs + dif + 8 * nins
|
||||
flgvalnew = 2 * (i + nins)
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec + nins))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(sectiontarget[zerosecstart:insstart])
|
||||
datalst.append(sectionsource[srcstart:srcend])
|
||||
datalst.append(sectiontarget[insstart:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
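# return the EXTH block's base offset inside rec0 plus its total length and record count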
def get_exth_params(rec0):
|
||||
ebase = mobi_header_base + getint(rec0, mobi_header_length)
|
||||
elen = getint(rec0, ebase + 4)
|
||||
enum = getint(rec0, ebase + 8)
|
||||
return ebase, elen, enum
|
||||
|
||||
|
||||
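# insert a new EXTH record (id exth_num) and bump the EXTH length, record count and title offset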
def add_exth(rec0, exth_num, exth_bytes):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
newrecsize = 8 + len(exth_bytes)
|
||||
newrec0 = (
|
||||
rec0[0 : ebase + 4]
|
||||
+ struct.pack(b">L", elen + newrecsize)
|
||||
+ struct.pack(b">L", enum + 1)
|
||||
+ struct.pack(b">L", exth_num)
|
||||
+ struct.pack(b">L", newrecsize)
|
||||
+ exth_bytes
|
||||
+ rec0[ebase + 12 :]
|
||||
)
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) + newrecsize
|
||||
)
|
||||
return newrec0
|
||||
|
||||
|
||||
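# collect the data payloads of every EXTH record whose id matches exth_num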
def read_exth(rec0, exth_num):
|
||||
exth_values = []
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase = ebase + 12
|
||||
while enum > 0:
|
||||
exth_id = getint(rec0, ebase)
|
||||
if exth_id == exth_num:
|
||||
# We might have multiple exths, so build a list.
|
||||
exth_values.append(rec0[ebase + 8 : ebase + getint(rec0, ebase + 4)])
|
||||
enum = enum - 1
|
||||
ebase = ebase + getint(rec0, ebase + 4)
|
||||
return exth_values
|
||||
|
||||
|
||||
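# overwrite the first EXTH record matching exth_num, adjusting the EXTH length and the title offset for any size change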
def write_exth(rec0, exth_num, exth_bytes):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase_idx = ebase + 12
|
||||
enum_idx = enum
|
||||
while enum_idx > 0:
|
||||
exth_id = getint(rec0, ebase_idx)
|
||||
if exth_id == exth_num:
|
||||
dif = len(exth_bytes) + 8 - getint(rec0, ebase_idx + 4)
|
||||
newrec0 = rec0
|
||||
if dif != 0:
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) + dif
|
||||
)
|
||||
return (
|
||||
newrec0[: ebase + 4]
|
||||
+ struct.pack(
|
||||
b">L", elen + len(exth_bytes) + 8 - getint(rec0, ebase_idx + 4)
|
||||
)
|
||||
+ struct.pack(b">L", enum)
|
||||
+ rec0[ebase + 12 : ebase_idx + 4]
|
||||
+ struct.pack(b">L", len(exth_bytes) + 8)
|
||||
+ exth_bytes
|
||||
+ rec0[ebase_idx + getint(rec0, ebase_idx + 4) :]
|
||||
)
|
||||
enum_idx = enum_idx - 1
|
||||
ebase_idx = ebase_idx + getint(rec0, ebase_idx + 4)
|
||||
return rec0
|
||||
|
||||
|
||||
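# remove the first EXTH record matching exth_num, shrinking the EXTH length, record count and title offset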
def del_exth(rec0, exth_num):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase_idx = ebase + 12
|
||||
enum_idx = 0
|
||||
while enum_idx < enum:
|
||||
exth_id = getint(rec0, ebase_idx)
|
||||
exth_size = getint(rec0, ebase_idx + 4)
|
||||
if exth_id == exth_num:
|
||||
newrec0 = rec0
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) - exth_size
|
||||
)
|
||||
newrec0 = newrec0[:ebase_idx] + newrec0[ebase_idx + exth_size :]
|
||||
newrec0 = (
|
||||
newrec0[0 : ebase + 4]
|
||||
+ struct.pack(b">L", elen - exth_size)
|
||||
+ struct.pack(b">L", enum - 1)
|
||||
+ newrec0[ebase + 12 :]
|
||||
)
|
||||
return newrec0
|
||||
enum_idx += 1
|
||||
ebase_idx = ebase_idx + exth_size
|
||||
return rec0
|
||||
|
||||
|
||||
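# split a combination KF7/KF8 mobi (joined by a BOUNDARY record) into standalone mobi7 and mobi8 images;
# self.combo is False when there is nothing to split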
class mobi_split:
|
||||
def __init__(self, infile):
|
||||
datain = b""
|
||||
with open(pathof(infile), "rb") as f:
|
||||
datain = f.read()
|
||||
datain_rec0 = readsection(datain, 0)
|
||||
ver = getint(datain_rec0, mobi_version)
|
||||
self.combo = ver != 8
|
||||
if not self.combo:
|
||||
return
|
||||
exth121 = read_exth(datain_rec0, 121)
|
||||
if len(exth121) == 0:
|
||||
self.combo = False
|
||||
return
|
||||
else:
|
||||
# only pay attention to first exth121
|
||||
# (there should only be one)
|
||||
(datain_kf8,) = struct.unpack_from(b">L", exth121[0], 0)
|
||||
if datain_kf8 == 0xFFFFFFFF:
|
||||
self.combo = False
|
||||
return
|
||||
datain_kfrec0 = readsection(datain, datain_kf8)
|
||||
|
||||
# create the standalone mobi7
|
||||
num_sec = getint(datain, number_of_pdb_records, b"H")
|
||||
# remove BOUNDARY up to but not including ELF record
|
||||
self.result_file7 = deletesectionrange(datain, datain_kf8 - 1, num_sec - 2)
|
||||
# check if there are SRCS records and delete them
|
||||
srcs = getint(datain_rec0, srcs_index)
|
||||
num_srcs = getint(datain_rec0, srcs_count)
|
||||
if srcs != 0xFFFFFFFF and num_srcs > 0:
|
||||
self.result_file7 = deletesectionrange(
|
||||
self.result_file7, srcs, srcs + num_srcs - 1
|
||||
)
|
||||
datain_rec0 = writeint(datain_rec0, srcs_index, 0xFFFFFFFF)
|
||||
datain_rec0 = writeint(datain_rec0, srcs_count, 0)
|
||||
# reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
|
||||
datain_rec0 = write_exth(datain_rec0, 121, struct.pack(b">L", 0xFFFFFFFF))
|
||||
# datain_rec0 = del_exth(datain_rec0,121)
|
||||
# datain_rec0 = del_exth(datain_rec0,534)
|
||||
# don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
|
||||
# set the EXTH 129 KF8 Masthead / Cover Image string to the null string
|
||||
datain_rec0 = write_exth(datain_rec0, 129, b"")
|
||||
# don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
|
||||
|
||||
# need to reset flags stored in 0x80-0x83
|
||||
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||
# Bit Flags
|
||||
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||
# 0x0040 = exth exists
|
||||
# 0x0010 = Not sure but this is always set so far
|
||||
(fval,) = struct.unpack_from(b">L", datain_rec0, 0x80)
|
||||
# need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
|
||||
fval = fval & 0x07FF
|
||||
datain_rec0 = datain_rec0[:0x80] + struct.pack(b">L", fval) + datain_rec0[0x84:]
|
||||
|
||||
self.result_file7 = writesection(self.result_file7, 0, datain_rec0)
|
||||
|
||||
# no need to replace kf8 style fcis with mobi 7 one
|
||||
# fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
|
||||
# if fcis_secnum != 0xffffffff:
|
||||
# fcis_info = readsection(datain, fcis_secnum)
|
||||
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||
# new_fcis += struct.pack(b'>L',text_len)
|
||||
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||
# self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
|
||||
|
||||
firstimage = getint(datain_rec0, first_resc_record)
|
||||
lastimage = getint(datain_rec0, last_content_index, b"H")
|
||||
# print("Old First Image, last Image", firstimage,lastimage)
|
||||
if lastimage == 0xFFFF:
|
||||
# find the lowest of the next sections and copy up to that.
|
||||
ofs_list = [
|
||||
(fcis_index, b"L"),
|
||||
(flis_index, b"L"),
|
||||
(datp_index, b"L"),
|
||||
(hufftbloff, b"L"),
|
||||
]
|
||||
for ofs, sz in ofs_list:
|
||||
n = getint(datain_rec0, ofs, sz)
|
||||
# print("n",n)
|
||||
if n > 0 and n < lastimage:
|
||||
lastimage = n - 1
|
||||
logger.debug("First Image, last Image %s %s" % (firstimage, lastimage))
|
||||
|
||||
# Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
|
||||
for i in range(firstimage, lastimage):
|
||||
imgsec = readsection(self.result_file7, i)
|
||||
if imgsec[0:4] in [b"RESC", b"FONT"]:
|
||||
self.result_file7 = nullsection(self.result_file7, i)
|
||||
|
||||
# mobi7 finished
|
||||
|
||||
# create standalone mobi8
|
||||
self.result_file8 = deletesectionrange(datain, 0, datain_kf8 - 1)
|
||||
target = getint(datain_kfrec0, first_resc_record)
|
||||
self.result_file8 = insertsectionrange(
|
||||
datain, firstimage, lastimage, self.result_file8, target
|
||||
)
|
||||
datain_kfrec0 = readsection(self.result_file8, 0)
|
||||
|
||||
# Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
|
||||
kf8starts = read_exth(datain_kfrec0, 116)
|
||||
# If we have multiple StartOffset, keep only the last one
|
||||
kf8start_count = len(kf8starts)
|
||||
while kf8start_count > 1:
|
||||
kf8start_count -= 1
|
||||
datain_kfrec0 = del_exth(datain_kfrec0, 116)
|
||||
|
||||
# update the EXTH 125 KF8 Count of Images/Fonts/Resources
|
||||
datain_kfrec0 = write_exth(
|
||||
datain_kfrec0, 125, struct.pack(b">L", lastimage - firstimage + 1)
|
||||
)
|
||||
|
||||
# need to reset flags stored in 0x80-0x83
|
||||
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||
# standalone mobi8 with exth: 0x0050
|
||||
# Bit Flags
|
||||
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||
# 0x0040 = exth exists
|
||||
# 0x0010 = Not sure but this is always set so far
|
||||
(fval,) = struct.unpack_from(b">L", datain_kfrec0, 0x80)
|
||||
fval = fval & 0x1FFF
|
||||
fval |= 0x0800
|
||||
datain_kfrec0 = (
|
||||
datain_kfrec0[:0x80] + struct.pack(b">L", fval) + datain_kfrec0[0x84:]
|
||||
)
|
||||
|
||||
# properly update other index pointers that have been shifted by the insertion of images
|
||||
ofs_list = [
|
||||
(kf8_fdst_index, b"L"),
|
||||
(fcis_index, b"L"),
|
||||
(flis_index, b"L"),
|
||||
(datp_index, b"L"),
|
||||
(hufftbloff, b"L"),
|
||||
]
|
||||
for ofs, sz in ofs_list:
|
||||
n = getint(datain_kfrec0, ofs, sz)
|
||||
if n != 0xFFFFFFFF:
|
||||
datain_kfrec0 = writeint(
|
||||
datain_kfrec0, ofs, n + lastimage - firstimage + 1, sz
|
||||
)
|
||||
self.result_file8 = writesection(self.result_file8, 0, datain_kfrec0)
|
||||
|
||||
# no need to replace kf8 style fcis with mobi 7 one
|
||||
# fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
|
||||
# if fcis_secnum != 0xffffffff:
|
||||
# fcis_info = readsection(self.result_file8, fcis_secnum)
|
||||
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||
# new_fcis += struct.pack(b'>L',text_len)
|
||||
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||
# self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
|
||||
|
||||
# mobi8 finished
|
||||
|
||||
def getResult8(self):
|
||||
return self.result_file8
|
||||
|
||||
def getResult7(self):
|
||||
return self.result_file7
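# typical use (sketch): m = mobi_split(infile); if m.combo, write m.getResult7() and m.getResult8() out as separate .mobi files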
|
||||
138
mobiparse/mobi/mobi_uncompress.py
Executable file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bchr, lmap, bstr
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UncompressedReader:
|
||||
def unpack(self, data):
|
||||
return data
|
||||
|
||||
|
||||
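# PalmDOC (LZ77-style) decompressor: literal runs, single literal bytes, space+char pairs (>= 0xC0) and back-references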
class PalmdocReader:
|
||||
def unpack(self, i):
|
||||
o, p = b"", 0
|
||||
while p < len(i):
|
||||
# for python 3 must use slice since i[p] returns int while slice returns character
|
||||
c = ord(i[p : p + 1])
|
||||
p += 1
|
||||
if c >= 1 and c <= 8:
|
||||
o += i[p : p + c]
|
||||
p += c
|
||||
elif c < 128:
|
||||
o += bchr(c)
|
||||
elif c >= 192:
|
||||
o += b" " + bchr(c ^ 128)
|
||||
else:
|
||||
if p < len(i):
|
||||
c = (c << 8) | ord(i[p : p + 1])
|
||||
p += 1
|
||||
m = (c >> 3) & 0x07FF
|
||||
n = (c & 7) + 3
|
||||
if m > n:
|
||||
o += o[-m : n - m]
|
||||
else:
|
||||
for _ in range(n):
|
||||
# because of completely ass-backwards decision by python maintainers for python 3
|
||||
# we must use slice for bytes as i[p] returns int while slice returns character
|
||||
if m == 1:
|
||||
o += o[-m:]
|
||||
else:
|
||||
o += o[-m : -m + 1]
|
||||
return o
|
||||
|
||||
|
||||
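# HUFF/CDIC decompressor: loadHuff and loadCdic parse the huffman tables and phrase dictionary,
# unpack decodes the compressed text, recursively expanding not-yet-expanded dictionary entries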
class HuffcdicReader:
|
||||
q = struct.Struct(b">Q").unpack_from
|
||||
|
||||
def loadHuff(self, huff):
|
||||
if huff[0:8] != b"HUFF\x00\x00\x00\x18":
|
||||
raise unpackException("invalid huff header")
|
||||
off1, off2 = struct.unpack_from(b">LL", huff, 8)
|
||||
|
||||
def dict1_unpack(v):
|
||||
codelen, term, maxcode = v & 0x1F, v & 0x80, v >> 8
|
||||
assert codelen != 0
|
||||
if codelen <= 8:
|
||||
assert term
|
||||
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
|
||||
return (codelen, term, maxcode)
|
||||
|
||||
self.dict1 = lmap(dict1_unpack, struct.unpack_from(b">256L", huff, off1))
|
||||
|
||||
dict2 = struct.unpack_from(b">64L", huff, off2)
|
||||
self.mincode, self.maxcode = (), ()
|
||||
for codelen, mincode in enumerate((0,) + dict2[0::2]):
|
||||
self.mincode += (mincode << (32 - codelen),)
|
||||
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
|
||||
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1,)
|
||||
|
||||
self.dictionary = []
|
||||
|
||||
def loadCdic(self, cdic):
|
||||
if cdic[0:8] != b"CDIC\x00\x00\x00\x10":
|
||||
raise unpackException("invalid cdic header")
|
||||
phrases, bits = struct.unpack_from(b">LL", cdic, 8)
|
||||
n = min(1 << bits, phrases - len(self.dictionary))
|
||||
h = struct.Struct(b">H").unpack_from
|
||||
|
||||
def getslice(off):
|
||||
(blen,) = h(cdic, 16 + off)
|
||||
slice = cdic[18 + off : 18 + off + (blen & 0x7FFF)]
|
||||
return (slice, blen & 0x8000)
|
||||
|
||||
self.dictionary += lmap(
|
||||
getslice, struct.unpack_from(bstr(">%dH" % n), cdic, 16)
|
||||
)
|
||||
|
||||
def unpack(self, data):
|
||||
q = HuffcdicReader.q
|
||||
|
||||
bitsleft = len(data) * 8
|
||||
data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||
pos = 0
|
||||
(x,) = q(data, pos)
|
||||
n = 32
|
||||
|
||||
s = b""
|
||||
while True:
|
||||
if n <= 0:
|
||||
pos += 4
|
||||
(x,) = q(data, pos)
|
||||
n += 32
|
||||
code = (x >> n) & ((1 << 32) - 1)
|
||||
|
||||
codelen, term, maxcode = self.dict1[code >> 24]
|
||||
if not term:
|
||||
while code < self.mincode[codelen]:
|
||||
codelen += 1
|
||||
maxcode = self.maxcode[codelen]
|
||||
|
||||
n -= codelen
|
||||
bitsleft -= codelen
|
||||
if bitsleft < 0:
|
||||
break
|
||||
|
||||
r = (maxcode - code) >> (32 - codelen)
|
||||
slice, flag = self.dictionary[r]
|
||||
if not flag:
|
||||
self.dictionary[r] = None
|
||||
slice = self.unpack(slice)
|
||||
self.dictionary[r] = (slice, 1)
|
||||
s += slice
|
||||
return s
|
||||
252
mobiparse/mobi/mobi_utils.py
Executable file
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
# flake8: noqa
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, text_type, bchr, bord
|
||||
|
||||
import binascii
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
from itertools import cycle
|
||||
|
||||
|
||||
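# map a Mobi language id / sub-language id pair to an IANA language tag (falls back to "en")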
def getLanguage(langID, sublangID):
|
||||
mobilangdict = {
|
||||
54: {0: "af"}, # Afrikaans
|
||||
28: {0: "sq"}, # Albanian
|
||||
1: {
|
||||
0: "ar",
|
||||
5: "ar-dz",
|
||||
15: "ar-bh",
|
||||
3: "ar-eg",
|
||||
2: "ar-iq",
|
||||
11: "ar-jo",
|
||||
13: "ar-kw",
|
||||
12: "ar-lb",
|
||||
4: "ar-ly",
|
||||
6: "ar-ma",
|
||||
8: "ar-om",
|
||||
16: "ar-qa",
|
||||
1: "ar-sa",
|
||||
10: "ar-sy",
|
||||
7: "ar-tn",
|
||||
14: "ar-ae",
|
||||
9: "ar-ye",
|
||||
},
|
||||
# Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
|
||||
# (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
|
||||
# (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
|
||||
# (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
|
||||
# Emirates), Arabic (Yemen)
|
||||
43: {0: "hy"}, # Armenian
|
||||
77: {0: "as"}, # Assamese
|
||||
44: {0: "az"},  # "Azeri" (IANA: Azerbaijani)
|
||||
45: {0: "eu"}, # Basque
|
||||
35: {0: "be"}, # Belarusian
|
||||
69: {0: "bn"}, # Bengali
|
||||
2: {0: "bg"}, # Bulgarian
|
||||
3: {0: "ca"}, # Catalan
|
||||
4: {0: "zh", 3: "zh-hk", 2: "zh-cn", 4: "zh-sg", 1: "zh-tw"},
|
||||
# Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
|
||||
26: {0: "hr", 3: "sr"}, # Croatian, Serbian
|
||||
5: {0: "cs"}, # Czech
|
||||
6: {0: "da"}, # Danish
|
||||
19: {0: "nl", 1: "nl", 2: "nl-be"}, # Dutch / Flemish, Dutch (Belgium)
|
||||
9: {
|
||||
0: "en",
|
||||
1: "en",
|
||||
3: "en-au",
|
||||
40: "en-bz",
|
||||
4: "en-ca",
|
||||
6: "en-ie",
|
||||
8: "en-jm",
|
||||
5: "en-nz",
|
||||
13: "en-ph",
|
||||
7: "en-za",
|
||||
11: "en-tt",
|
||||
2: "en-gb",
|
||||
1: "en-us",
|
||||
12: "en-zw",
|
||||
},
|
||||
# English, English (Australia), English (Belize), English (Canada),
|
||||
# English (Ireland), English (Jamaica), English (New Zealand), English
|
||||
# (Philippines), English (South Africa), English (Trinidad), English
|
||||
# (United Kingdom), English (United States), English (Zimbabwe)
|
||||
37: {0: "et"}, # Estonian
|
||||
56: {0: "fo"}, # Faroese
|
||||
41: {0: "fa"}, # Farsi / Persian
|
||||
11: {0: "fi"}, # Finnish
|
||||
12: {
|
||||
0: "fr",
|
||||
1: "fr",
|
||||
2: "fr-be",
|
||||
3: "fr-ca",
|
||||
5: "fr-lu",
|
||||
6: "fr-mc",
|
||||
4: "fr-ch",
|
||||
},
|
||||
# French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
|
||||
55: {0: "ka"}, # Georgian
|
||||
7: {0: "de", 1: "de", 3: "de-at", 5: "de-li", 4: "de-lu", 2: "de-ch"},
|
||||
# German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
|
||||
8: {0: "el"}, # Greek, Modern (1453-)
|
||||
71: {0: "gu"}, # Gujarati
|
||||
13: {0: "he"}, # Hebrew (also code 'iw'?)
|
||||
57: {0: "hi"}, # Hindi
|
||||
14: {0: "hu"}, # Hungarian
|
||||
15: {0: "is"}, # Icelandic
|
||||
33: {0: "id"}, # Indonesian
|
||||
16: {0: "it", 1: "it", 2: "it-ch"}, # Italian, Italian (Switzerland)
|
||||
17: {0: "ja"}, # Japanese
|
||||
75: {0: "kn"}, # Kannada
|
||||
63: {0: "kk"}, # Kazakh
|
||||
87: {0: "x-kok"}, # Konkani (real language code is 'kok'?)
|
||||
18: {0: "ko"}, # Korean
|
||||
38: {0: "lv"}, # Latvian
|
||||
39: {0: "lt"}, # Lithuanian
|
||||
47: {0: "mk"}, # Macedonian
|
||||
62: {0: "ms"}, # Malay
|
||||
76: {0: "ml"}, # Malayalam
|
||||
58: {0: "mt"}, # Maltese
|
||||
78: {0: "mr"}, # Marathi
|
||||
97: {0: "ne"}, # Nepali
|
||||
20: {0: "no"}, # Norwegian
|
||||
72: {0: "or"}, # Oriya
|
||||
21: {0: "pl"}, # Polish
|
||||
22: {0: "pt", 2: "pt", 1: "pt-br"}, # Portuguese, Portuguese (Brazil)
|
||||
70: {0: "pa"}, # Punjabi
|
||||
23: {0: "rm"}, # "Rhaeto-Romanic" (IANA: Romansh)
|
||||
24: {0: "ro"}, # Romanian
|
||||
25: {0: "ru"}, # Russian
|
||||
59: {0: "sz"}, # "Sami (Lappish)" (not an IANA language code)
|
||||
# IANA code for "Northern Sami" is 'se'
|
||||
# 'SZ' is the IANA region code for Swaziland
|
||||
79: {0: "sa"}, # Sanskrit
|
||||
27: {0: "sk"}, # Slovak
|
||||
36: {0: "sl"}, # Slovenian
|
||||
46: {0: "sb"}, # "Sorbian" (not an IANA language code)
|
||||
# 'SB' is IANA region code for 'Solomon Islands'
|
||||
# Lower Sorbian = 'dsb'
|
||||
# Upper Sorbian = 'hsb'
|
||||
# Sorbian Languages = 'wen'
|
||||
10: {
|
||||
0: "es",
|
||||
4: "es",
|
||||
44: "es-ar",
|
||||
64: "es-bo",
|
||||
52: "es-cl",
|
||||
36: "es-co",
|
||||
20: "es-cr",
|
||||
28: "es-do",
|
||||
48: "es-ec",
|
||||
68: "es-sv",
|
||||
16: "es-gt",
|
||||
72: "es-hn",
|
||||
8: "es-mx",
|
||||
76: "es-ni",
|
||||
24: "es-pa",
|
||||
60: "es-py",
|
||||
40: "es-pe",
|
||||
80: "es-pr",
|
||||
56: "es-uy",
|
||||
32: "es-ve",
|
||||
},
|
||||
# Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
|
||||
# (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
|
||||
# Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
|
||||
# Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
|
||||
# Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
|
||||
# (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
|
||||
48: {0: "sx"}, # "Sutu" (not an IANA language code)
|
||||
# "Sutu" is another name for "Southern Sotho"?
|
||||
# IANA code for "Southern Sotho" is 'st'
|
||||
65: {0: "sw"}, # Swahili
|
||||
29: {0: "sv", 1: "sv", 8: "sv-fi"}, # Swedish, Swedish (Finland)
|
||||
73: {0: "ta"}, # Tamil
|
||||
68: {0: "tt"}, # Tatar
|
||||
74: {0: "te"}, # Telugu
|
||||
30: {0: "th"}, # Thai
|
||||
49: {0: "ts"}, # Tsonga
|
||||
50: {0: "tn"}, # Tswana
|
||||
31: {0: "tr"}, # Turkish
|
||||
34: {0: "uk"}, # Ukrainian
|
||||
32: {0: "ur"}, # Urdu
|
||||
67: {0: "uz", 2: "uz"}, # Uzbek
|
||||
42: {0: "vi"}, # Vietnamese
|
||||
52: {0: "xh"}, # Xhosa
|
||||
53: {0: "zu"}, # Zulu
|
||||
}
|
||||
lang = "en"
|
||||
if langID in mobilangdict:
|
||||
subdict = mobilangdict[langID]
|
||||
lang = subdict[0]
|
||||
if sublangID in subdict:
|
||||
lang = subdict[sublangID]
|
||||
return lang
|
||||
|
||||
|
||||
def toHex(byteList):
|
||||
return binascii.hexlify(byteList)
|
||||
|
||||
|
||||
# returns base32 bytestring
|
||||
def toBase32(value, npad=4):
|
||||
digits = b"0123456789ABCDEFGHIJKLMNOPQRSTUV"
|
||||
num_string = b""
|
||||
current = value
|
||||
while current != 0:
|
||||
next, remainder = divmod(current, 32)
|
||||
rem_string = digits[remainder : remainder + 1]
|
||||
num_string = rem_string + num_string
|
||||
current = next
|
||||
if num_string == b"":
|
||||
num_string = b"0"
|
||||
pad = npad - len(num_string)
|
||||
if pad > 0:
|
||||
num_string = b"0" * pad + num_string
|
||||
return num_string
|
||||
|
||||
|
||||
# converts base32 string to value
|
||||
def fromBase32(str_num):
|
||||
if isinstance(str_num, text_type):
|
||||
str_num = str_num.encode("latin-1")
|
||||
scalelst = [1, 32, 1024, 32768, 1048576, 33554432, 1073741824, 34359738368]
|
||||
value = 0
|
||||
j = 0
|
||||
n = len(str_num)
|
||||
scale = 0
|
||||
for i in range(n):
|
||||
c = str_num[n - i - 1 : n - i]
|
||||
if c in b"0123456789":
|
||||
v = ord(c) - ord(b"0")
|
||||
else:
|
||||
v = ord(c) - ord(b"A") + 10
|
||||
if j < len(scalelst):
|
||||
scale = scalelst[j]
|
||||
else:
|
||||
scale = scale * 32
|
||||
j += 1
|
||||
if v != 0:
|
||||
value = value + (v * scale)
|
||||
return value
|
||||
|
||||
|
||||
# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
|
||||
# in place of ascii you will get a byte to half-word or integer
|
||||
# one to one mapping of values from 0 - 255
|
||||
|
||||
|
||||
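# Adobe-style font obfuscation: XOR the first 1024 bytes of the font with the key, leave the remainder untouched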
def mangle_fonts(encryption_key, data):
|
||||
if isinstance(encryption_key, text_type):
|
||||
encryption_key = encryption_key.encode("latin-1")
|
||||
crypt = data[:1024]
|
||||
key = cycle(iter(map(bord, encryption_key)))
|
||||
# encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
|
||||
encrypt = b"".join([bchr(bord(x) ^ next(key)) for x in crypt])
|
||||
return encrypt + data[1024:]
|
||||
585
mobiparse/mobi/mobiml2xhtml.py
Executable file
@@ -0,0 +1,585 @@
|
||||
#! /usr/bin/python
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
|
||||
# this program works in concert with the output from KindleUnpack
|
||||
|
||||
"""
|
||||
Convert from Mobi ML to XHTML
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
SPECIAL_HANDLING_TAGS = {
|
||||
"?xml": ("xmlheader", -1),
|
||||
"!--": ("comment", -3),
|
||||
"!DOCTYPE": ("doctype", -1),
|
||||
}
|
||||
|
||||
SPECIAL_HANDLING_TYPES = ["xmlheader", "doctype", "comment"]
|
||||
|
||||
SELF_CLOSING_TAGS = [
|
||||
"br",
|
||||
"hr",
|
||||
"input",
|
||||
"img",
|
||||
"image",
|
||||
"meta",
|
||||
"spacer",
|
||||
"link",
|
||||
"frame",
|
||||
"base",
|
||||
"col",
|
||||
"reference",
|
||||
]
|
||||
|
||||
|
||||
class MobiMLConverter(object):
|
||||
|
||||
PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE)
|
||||
IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex")
|
||||
|
||||
def __init__(self, filename):
|
||||
self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n"
|
||||
self.base_css_rules += "p { margin: 0em }\n"
|
||||
self.base_css_rules += ".bold { font-weight: bold }\n"
|
||||
self.base_css_rules += ".italic { font-style: italic }\n"
|
||||
self.base_css_rules += (
|
||||
".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n"
|
||||
)
|
||||
self.tag_css_rules = {}
|
||||
self.tag_css_rule_cnt = 0
|
||||
self.path = []
|
||||
self.filename = filename
|
||||
self.wipml = open(self.filename, "rb").read()
|
||||
self.pos = 0
|
||||
self.opfname = self.filename.rsplit(".", 1)[0] + ".opf"
|
||||
self.opos = 0
|
||||
self.meta = ""
|
||||
self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css")
|
||||
self.current_font_size = 3
|
||||
self.font_history = []
|
||||
|
||||
def cleanup_html(self):
|
||||
self.wipml = re.sub(
|
||||
r'<div height="0(pt|px|ex|em|%){0,1}"></div>', "", self.wipml
|
||||
)
|
||||
self.wipml = self.wipml.replace("\r\n", "\n")
|
||||
self.wipml = self.wipml.replace("> <", ">\n<")
|
||||
self.wipml = self.wipml.replace("<mbp: ", "<mbp:")
|
||||
# self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
|
||||
self.wipml = self.wipml.replace("<br></br>", "<br/>")
|
||||
|
||||
def replace_page_breaks(self):
|
||||
self.wipml = self.PAGE_BREAK_PAT.sub(
|
||||
'<div class="mbp_pagebreak" />', self.wipml
|
||||
)
|
||||
|
||||
# parse leading text of ml and tag
|
||||
def parseml(self):
|
||||
p = self.pos
|
||||
if p >= len(self.wipml):
|
||||
return None
|
||||
if self.wipml[p] != "<":
|
||||
res = self.wipml.find("<", p)
|
||||
if res == -1:
|
||||
res = len(self.wipml)
|
||||
self.pos = res
|
||||
return self.wipml[p:res], None
|
||||
# handle comment as a special case to deal with multi-line comments
|
||||
if self.wipml[p : p + 4] == "<!--":
|
||||
te = self.wipml.find("-->", p + 1)
|
||||
if te != -1:
|
||||
te = te + 2
|
||||
else:
|
||||
te = self.wipml.find(">", p + 1)
|
||||
ntb = self.wipml.find("<", p + 1)
|
||||
if ntb != -1 and ntb < te:
|
||||
self.pos = ntb
|
||||
return self.wipml[p:ntb], None
|
||||
self.pos = te + 1
|
||||
return None, self.wipml[p : te + 1]
|
||||
|
||||
# parses string version of tag to identify its name,
|
||||
# its type 'begin', 'end' or 'single',
|
||||
# plus build a hashtable of its attributes
|
||||
# code is written to handle the possibility of very poor formatting
|
||||
def parsetag(self, s):
|
||||
p = 1
|
||||
# get the tag name
|
||||
tname = None
|
||||
ttype = None
|
||||
tattr = {}
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] == "/":
|
||||
ttype = "end"
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
|
||||
p += 1
|
||||
tname = s[b:p].lower()
|
||||
if tname == "!doctype":
|
||||
tname = "!DOCTYPE"
|
||||
# special cases
|
||||
if tname in SPECIAL_HANDLING_TAGS.keys():
|
||||
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
|
||||
tattr["special"] = s[p:backstep]
|
||||
if ttype is None:
|
||||
# parse any attributes
|
||||
while s.find("=", p) != -1:
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] != "=":
|
||||
p += 1
|
||||
aname = s[b:p].lower()
|
||||
aname = aname.rstrip(" ")
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] in ('"', "'"):
|
||||
p = p + 1
|
||||
b = p
|
||||
while s[p : p + 1] not in ('"', "'"):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
p += 1
|
||||
else:
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " "):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
tattr[aname] = val
|
||||
# label beginning and single tags
|
||||
if ttype is None:
|
||||
ttype = "begin"
|
||||
if s.find(" /", p) >= 0:
|
||||
ttype = "single_ext"
|
||||
elif s.find("/", p) >= 0:
|
||||
ttype = "single"
|
||||
return ttype, tname, tattr
|
||||
|
||||
# main routine to convert from mobi markup language to html
|
||||
def processml(self):
|
||||
|
||||
# are these really needed
|
||||
html_done = False
|
||||
head_done = False
|
||||
body_done = False
|
||||
|
||||
skip = False
|
||||
|
||||
htmlstr = ""
|
||||
self.replace_page_breaks()
|
||||
self.cleanup_html()
|
||||
|
||||
# now parse the cleaned up ml into standard xhtml
|
||||
while True:
|
||||
|
||||
r = self.parseml()
|
||||
if not r:
|
||||
break
|
||||
|
||||
text, tag = r
|
||||
|
||||
if text:
|
||||
if not skip:
|
||||
htmlstr += text
|
||||
|
||||
if tag:
|
||||
ttype, tname, tattr = self.parsetag(tag)
|
||||
|
||||
# If we run into a DTD or xml declarations inside the body ... bail.
|
||||
if (
|
||||
tname in SPECIAL_HANDLING_TAGS.keys()
|
||||
and tname != "comment"
|
||||
and body_done
|
||||
):
|
||||
htmlstr += "\n</body></html>"
|
||||
break
|
||||
|
||||
# make sure self-closing tags actually self-close
|
||||
if ttype == "begin" and tname in SELF_CLOSING_TAGS:
|
||||
ttype = "single"
|
||||
|
||||
# make sure any end tags of self-closing tags are discarded
|
||||
if ttype == "end" and tname in SELF_CLOSING_TAGS:
|
||||
continue
|
||||
|
||||
# remove embedded guide and references from old mobis
|
||||
if tname in ("guide", "ncx", "reference") and ttype in (
|
||||
"begin",
|
||||
"single",
|
||||
"single_ext",
|
||||
):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
if (
|
||||
tname in ("guide", "ncx", "reference", "font", "span")
|
||||
and ttype == "end"
|
||||
):
|
||||
if self.path[-1] == "removeme:{0}".format(tname):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
|
||||
# Get rid of font tags that only have a color attribute.
|
||||
if tname == "font" and ttype in ("begin", "single", "single_ext"):
|
||||
if "color" in tattr.keys() and len(tattr.keys()) == 1:
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
|
||||
# Get rid of empty spans in the markup.
|
||||
if (
|
||||
tname == "span"
|
||||
and ttype in ("begin", "single", "single_ext")
|
||||
and not len(tattr)
|
||||
):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
|
||||
# need to handle fonts outside of the normal methods
|
||||
# so fonts tags won't be added to the self.path since we keep track
|
||||
# of font tags separately with self.font_history
|
||||
if tname == "font" and ttype == "begin":
|
||||
# check for nested font start tags
|
||||
if len(self.font_history) > 0:
|
||||
# inject a font end tag
|
||||
taginfo = ("end", "font", None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
self.font_history.append((ttype, tname, tattr))
|
||||
# handle the current font start tag
|
||||
taginfo = (ttype, tname, tattr)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
continue
|
||||
|
||||
# check for nested font tags and unnest them
|
||||
if tname == "font" and ttype == "end":
|
||||
self.font_history.pop()
|
||||
# handle this font end tag
|
||||
taginfo = ("end", "font", None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
# check if we were nested
|
||||
if len(self.font_history) > 0:
|
||||
# inject a copy of the most recent font start tag from history
|
||||
taginfo = self.font_history[-1]
|
||||
htmlstr += self.processtag(taginfo)
|
||||
continue
|
||||
|
||||
# keep track of nesting path
|
||||
if ttype == "begin":
|
||||
self.path.append(tname)
|
||||
elif ttype == "end":
|
||||
if tname != self.path[-1]:
|
||||
print ("improper nesting: ", self.path, tname, ttype)
|
||||
if tname not in self.path:
|
||||
# handle case of end tag with no beginning by injecting empty begin tag
|
||||
taginfo = ("begin", tname, None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
print " - fixed by injecting empty start tag ", tname
|
||||
self.path.append(tname)
|
||||
elif len(self.path) > 1 and tname == self.path[-2]:
|
||||
# handle case of dangling missing end
|
||||
taginfo = ("end", self.path[-1], None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
print " - fixed by injecting end tag ", self.path[-1]
|
||||
self.path.pop()
|
||||
self.path.pop()
|
||||
|
||||
if tname == "removeme:{0}".format(tname):
|
||||
if ttype in ("begin", "single", "single_ext"):
|
||||
skip = True
|
||||
else:
|
||||
skip = False
|
||||
else:
|
||||
taginfo = (ttype, tname, tattr)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
|
||||
# handle potential issue of multiple html, head, and body sections
|
||||
if tname == "html" and ttype == "begin" and not html_done:
|
||||
htmlstr += "\n"
|
||||
html_done = True
|
||||
|
||||
if tname == "head" and ttype == "begin" and not head_done:
|
||||
htmlstr += "\n"
|
||||
# also add in metadata and style link tags
|
||||
htmlstr += self.meta
|
||||
htmlstr += (
|
||||
'<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||
)
|
||||
head_done = True
|
||||
|
||||
if tname == "body" and ttype == "begin" and not body_done:
|
||||
htmlstr += "\n"
|
||||
body_done = True
|
||||
|
||||
# handle issue of possibly missing html, head, and body tags
|
||||
# I have not seen this but the original did something like this so ...
|
||||
if not body_done:
|
||||
htmlstr = "<body>\n" + htmlstr + "</body>\n"
|
||||
if not head_done:
|
||||
headstr = "<head>\n"
|
||||
headstr += self.meta
|
||||
headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||
headstr += "</head>\n"
|
||||
htmlstr = headstr + htmlstr
|
||||
if not html_done:
|
||||
htmlstr = "<html>\n" + htmlstr + "</html>\n"
|
||||
|
||||
# finally add DOCTYPE info
|
||||
htmlstr = (
|
||||
'<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
+ htmlstr
|
||||
)
|
||||
|
||||
css = self.base_css_rules
|
||||
for cls, rule in self.tag_css_rules.items():
|
||||
css += ".%s { %s }\n" % (cls, rule)
|
||||
|
||||
return (htmlstr, css, self.cssname)
|
||||
|
||||
def ensure_unit(self, raw, unit="px"):
|
||||
if re.search(r"\d+$", raw) is not None:
|
||||
raw += unit
|
||||
return raw
|
||||
|
||||
# flatten possibly modified tag back to string
|
||||
def taginfo_tostring(self, taginfo):
|
||||
(ttype, tname, tattr) = taginfo
|
||||
if ttype is None or tname is None:
|
||||
return ""
|
||||
if ttype == "end":
|
||||
return "</%s>" % tname
|
||||
if (
|
||||
ttype in SPECIAL_HANDLING_TYPES
|
||||
and tattr is not None
|
||||
and "special" in tattr.keys()
|
||||
):
|
||||
info = tattr["special"]
|
||||
if ttype == "comment":
|
||||
return "<%s %s-->" % tname, info
|
||||
else:
|
||||
return "<%s %s>" % tname, info
|
||||
res = []
|
||||
res.append("<%s" % tname)
|
||||
if tattr is not None:
|
||||
for key in tattr.keys():
|
||||
res.append(' %s="%s"' % (key, tattr[key]))
|
||||
if ttype == "single":
|
||||
res.append("/>")
|
||||
elif ttype == "single_ext":
|
||||
res.append(" />")
|
||||
else:
|
||||
res.append(">")
|
||||
return "".join(res)
|
||||
|
||||
# routines to convert mobi ml tag attributes to xhtml attributes and styles
|
||||
def processtag(self, taginfo):
|
||||
# Converting mobi font sizes to numerics
|
||||
size_map = {
|
||||
"xx-small": "1",
|
||||
"x-small": "2",
|
||||
"small": "3",
|
||||
"medium": "4",
|
||||
"large": "5",
|
||||
"x-large": "6",
|
||||
"xx-large": "7",
|
||||
}
|
||||
|
||||
size_to_em_map = {
|
||||
"1": ".65em",
|
||||
"2": ".75em",
|
||||
"3": "1em",
|
||||
"4": "1.125em",
|
||||
"5": "1.25em",
|
||||
"6": "1.5em",
|
||||
"7": "2em",
|
||||
}
|
||||
|
||||
# current tag to work on
|
||||
(ttype, tname, tattr) = taginfo
|
||||
if not tattr:
|
||||
tattr = {}
|
||||
|
||||
styles = []
|
||||
|
||||
if tname is None or tname.startswith("removeme"):
|
||||
return ""
|
||||
|
||||
# have not seen an example of this yet so keep it here to be safe
|
||||
# until this is better understood
|
||||
if tname in (
|
||||
"country-region",
|
||||
"place",
|
||||
"placetype",
|
||||
"placename",
|
||||
"state",
|
||||
"city",
|
||||
"street",
|
||||
"address",
|
||||
"content",
|
||||
):
|
||||
tname = "div" if tname == "content" else "span"
|
||||
for key in list(tattr.keys()):
|
||||
tattr.pop(key)
|
||||
|
||||
# handle general case of style, height, width, bgcolor in any tag
|
||||
if "style" in tattr.keys():
|
||||
style = tattr.pop("style").strip()
|
||||
if style:
|
||||
styles.append(style)
|
||||
|
||||
if "align" in tattr.keys():
|
||||
align = tattr.pop("align").strip()
|
||||
if align:
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
else:
|
||||
styles.append("text-align: %s" % align)
|
||||
|
||||
if "height" in tattr.keys():
|
||||
height = tattr.pop("height").strip()
|
||||
if (
|
||||
height
|
||||
and "<" not in height
|
||||
and ">" not in height
|
||||
and re.search(r"\d+", height)
|
||||
):
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
elif tname == "img":
|
||||
tattr["height"] = height
|
||||
else:
|
||||
styles.append("margin-top: %s" % self.ensure_unit(height))
|
||||
|
||||
if "width" in tattr.keys():
|
||||
width = tattr.pop("width").strip()
|
||||
if width and re.search(r"\d+", width):
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
elif tname == "img":
|
||||
tattr["width"] = width
|
||||
else:
|
||||
styles.append("text-indent: %s" % self.ensure_unit(width))
|
||||
if width.startswith("-"):
|
||||
styles.append("margin-left: %s" % self.ensure_unit(width[1:]))
|
||||
|
||||
if "bgcolor" in tattr.keys():
|
||||
# no proprietary html allowed
|
||||
if tname == "div":
|
||||
del tattr["bgcolor"]
|
||||
|
||||
elif tname == "font":
|
||||
# Change font tags to span tags
|
||||
tname = "span"
|
||||
if ttype in ("begin", "single", "single_ext"):
|
||||
# move the face attribute to css font-family
|
||||
if "face" in tattr.keys():
|
||||
face = tattr.pop("face").strip()
|
||||
styles.append('font-family: "%s"' % face)
|
||||
|
||||
# Monitor the constantly changing font sizes, change them to ems and move
|
||||
# them to css. The following will work for 'flat' font tags, but nested font tags
|
||||
# will cause things to go wonky. Need to revert to the parent font tag's size
|
||||
# when a closing tag is encountered.
|
||||
if "size" in tattr.keys():
|
||||
sz = tattr.pop("size").strip().lower()
|
||||
try:
|
||||
float(sz)
|
||||
except ValueError:
|
||||
if sz in size_map.keys():
|
||||
sz = size_map[sz]
|
||||
else:
|
||||
if sz.startswith("-") or sz.startswith("+"):
|
||||
sz = self.current_font_size + float(sz)
|
||||
if sz > 7:
|
||||
sz = 7
|
||||
elif sz < 1:
|
||||
sz = 1
|
||||
sz = str(int(sz))
|
||||
styles.append("font-size: %s" % size_to_em_map[sz])
|
||||
self.current_font_size = int(sz)
|
||||
|
||||
elif tname == "img":
|
||||
for attr in ("width", "height"):
|
||||
if attr in tattr:
|
||||
val = tattr[attr]
|
||||
if val.lower().endswith("em"):
|
||||
try:
|
||||
nval = float(val[:-2])
|
||||
nval *= 16 * (
|
||||
168.451 / 72
|
||||
) # Assume this was set using the Kindle profile
|
||||
tattr[attr] = "%dpx" % int(nval)
|
||||
except:
|
||||
del tattr[attr]
|
||||
elif val.lower().endswith("%"):
|
||||
del tattr[attr]
|
||||
|
||||
# convert the anchor tags
|
||||
if "filepos-id" in tattr:
|
||||
tattr["id"] = tattr.pop("filepos-id")
|
||||
if "name" in tattr and tattr["name"] != tattr["id"]:
|
||||
tattr["name"] = tattr["id"]
|
||||
|
||||
if "filepos" in tattr:
|
||||
filepos = tattr.pop("filepos")
|
||||
try:
|
||||
tattr["href"] = "#filepos%d" % int(filepos)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if styles:
|
||||
ncls = None
|
||||
rule = "; ".join(styles)
|
||||
for sel, srule in self.tag_css_rules.items():
|
||||
if srule == rule:
|
||||
ncls = sel
|
||||
break
|
||||
if ncls is None:
|
||||
self.tag_css_rule_cnt += 1
|
||||
ncls = "rule_%d" % self.tag_css_rule_cnt
|
||||
self.tag_css_rules[ncls] = rule
|
||||
cls = tattr.get("class", "")
|
||||
cls = cls + (" " if cls else "") + ncls
|
||||
tattr["class"] = cls
|
||||
|
||||
# convert updated tag back to string representation
|
||||
if len(tattr) == 0:
|
||||
tattr = None
|
||||
taginfo = (ttype, tname, tattr)
|
||||
return self.taginfo_tostring(taginfo)
|
||||
|
||||
|
||||
""" main only left in for testing outside of plugin """
|
||||
|
||||
|
||||
def main(argv=sys.argv):
|
||||
if len(argv) != 2:
|
||||
return 1
|
||||
else:
|
||||
infile = argv[1]
|
||||
|
||||
try:
|
||||
print "Converting Mobi Markup Language to XHTML"
|
||||
mlc = MobiMLConverter(infile)
|
||||
print "Processing ..."
|
||||
htmlstr, css, cssname = mlc.processml()
|
||||
outname = infile.rsplit(".", 1)[0] + "_converted.html"
|
||||
open(outname, "wb").write(htmlstr)
|
||||
open(cssname, "wb").write(css)
|
||||
print "Completed"
|
||||
print "XHTML version of book can be found at: " + outname
|
||||
|
||||
except ValueError as e:
|
||||
print "Error: %s" % e
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
103
mobiparse/mobi/unipath.py
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
from .compatibility_utils import PY2, text_type, binary_type
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# utility routines to convert all paths to be full unicode
|
||||
|
||||
# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
|
||||
# Under Python 3, if bytes, try to convert it to unicode using os.fsencode() to decode it
|
||||
|
||||
# Mac OS X and Windows will happily support full unicode paths
|
||||
# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
|
||||
|
||||
fsencoding = sys.getfilesystemencoding()
|
||||
|
||||
|
||||
def pathof(s, enc=fsencoding):
|
||||
if s is None:
|
||||
return None
|
||||
if isinstance(s, text_type):
|
||||
return s
|
||||
if isinstance(s, binary_type):
|
||||
try:
|
||||
return s.decode(enc)
|
||||
except:
|
||||
pass
|
||||
return s
|
||||
|
||||
|
||||
def exists(s):
|
||||
return os.path.exists(pathof(s))
|
||||
|
||||
|
||||
def isfile(s):
|
||||
return os.path.isfile(pathof(s))
|
||||
|
||||
|
||||
def isdir(s):
|
||||
return os.path.isdir(pathof(s))
|
||||
|
||||
|
||||
def mkdir(s):
|
||||
return os.mkdir(pathof(s))
|
||||
|
||||
|
||||
def listdir(s):
|
||||
rv = []
|
||||
for file in os.listdir(pathof(s)):
|
||||
rv.append(pathof(file))
|
||||
return rv
|
||||
|
||||
|
||||
def getcwd():
|
||||
if PY2:
|
||||
return os.getcwdu()
|
||||
return os.getcwd()
|
||||
|
||||
|
||||
def walk(top):
|
||||
top = pathof(top)
|
||||
rv = []
|
||||
for base, dnames, names in os.walk(top):
|
||||
base = pathof(base)
|
||||
for name in names:
|
||||
name = pathof(name)
|
||||
rv.append(relpath(os.path.join(base, name), top))
|
||||
return rv
|
||||
|
||||
|
||||
def relpath(path, start=None):
|
||||
return os.path.relpath(pathof(path), pathof(start))
|
||||
|
||||
|
||||
def abspath(path):
|
||||
return os.path.abspath(pathof(path))
|
||||
175
mobiparse/mobi/unpack_structure.py
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import text_type
|
||||
|
||||
from . import unipath
|
||||
from .unipath import pathof
|
||||
|
||||
DUMP = False
|
||||
""" Set to True to dump all possible information. """
|
||||
|
||||
import os
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
import zipfile
|
||||
import binascii
|
||||
from .mobi_utils import mangle_fonts
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ZipInfo(zipfile.ZipInfo):
|
||||
def __init__(self, *args, **kwargs):
|
||||
if "compress_type" in kwargs:
|
||||
compress_type = kwargs.pop("compress_type")
|
||||
super(ZipInfo, self).__init__(*args, **kwargs)
|
||||
self.compress_type = compress_type
|
||||
|
||||
|
||||
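# builds the on-disk output layout (mobi7/Images, HDImages, mobi8/OEBPS/...) and packs the mobi8 part into an epub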
class fileNames:
|
||||
def __init__(self, infile, outdir):
|
||||
self.infile = infile
|
||||
self.outdir = outdir
|
||||
if not unipath.exists(self.outdir):
|
||||
unipath.mkdir(self.outdir)
|
||||
self.mobi7dir = os.path.join(self.outdir, "mobi7")
|
||||
if not unipath.exists(self.mobi7dir):
|
||||
unipath.mkdir(self.mobi7dir)
|
||||
self.imgdir = os.path.join(self.mobi7dir, "Images")
|
||||
if not unipath.exists(self.imgdir):
|
||||
unipath.mkdir(self.imgdir)
|
||||
self.hdimgdir = os.path.join(self.outdir, "HDImages")
|
||||
if not unipath.exists(self.hdimgdir):
|
||||
unipath.mkdir(self.hdimgdir)
|
||||
self.outbase = os.path.join(
|
||||
self.outdir, os.path.splitext(os.path.split(infile)[1])[0]
|
||||
)
|
||||
|
||||
def getInputFileBasename(self):
|
||||
return os.path.splitext(os.path.basename(self.infile))[0]
|
||||
|
||||
def makeK8Struct(self):
|
||||
self.k8dir = os.path.join(self.outdir, "mobi8")
|
||||
if not unipath.exists(self.k8dir):
|
||||
unipath.mkdir(self.k8dir)
|
||||
self.k8metainf = os.path.join(self.k8dir, "META-INF")
|
||||
if not unipath.exists(self.k8metainf):
|
||||
unipath.mkdir(self.k8metainf)
|
||||
self.k8oebps = os.path.join(self.k8dir, "OEBPS")
|
||||
if not unipath.exists(self.k8oebps):
|
||||
unipath.mkdir(self.k8oebps)
|
||||
self.k8images = os.path.join(self.k8oebps, "Images")
|
||||
if not unipath.exists(self.k8images):
|
||||
unipath.mkdir(self.k8images)
|
||||
self.k8fonts = os.path.join(self.k8oebps, "Fonts")
|
||||
if not unipath.exists(self.k8fonts):
|
||||
unipath.mkdir(self.k8fonts)
|
||||
self.k8styles = os.path.join(self.k8oebps, "Styles")
|
||||
if not unipath.exists(self.k8styles):
|
||||
unipath.mkdir(self.k8styles)
|
||||
self.k8text = os.path.join(self.k8oebps, "Text")
|
||||
if not unipath.exists(self.k8text):
|
||||
unipath.mkdir(self.k8text)
|
||||
|
||||
# recursive zip creation support routine
|
||||
def zipUpDir(self, myzip, tdir, localname):
|
||||
currentdir = tdir
|
||||
if localname != "":
|
||||
currentdir = os.path.join(currentdir, localname)
|
||||
list = unipath.listdir(currentdir)
|
||||
for file in list:
|
||||
afilename = file
|
||||
localfilePath = os.path.join(localname, afilename)
|
||||
realfilePath = os.path.join(currentdir, file)
|
||||
if unipath.isfile(realfilePath):
|
||||
myzip.write(
|
||||
pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED
|
||||
)
|
||||
elif unipath.isdir(realfilePath):
|
||||
self.zipUpDir(myzip, tdir, localfilePath)
|
||||
|
||||
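# assemble the epub: obfuscate used fonts when requested, write container.xml / encryption.xml,
# then zip the mimetype (stored, uncompressed) plus META-INF and OEBPS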
def makeEPUB(self, usedmap, obfuscate_data, uid):
|
||||
bname = os.path.join(self.k8dir, self.getInputFileBasename() + ".epub")
|
||||
# Create an encryption key for Adobe font obfuscation
|
||||
# based on the epub's uid
|
||||
if isinstance(uid, text_type):
|
||||
uid = uid.encode("ascii")
|
||||
if obfuscate_data:
|
||||
key = re.sub(br"[^a-fA-F0-9]", b"", uid)
|
||||
key = binascii.unhexlify((key + key)[:32])
|
||||
|
||||
# copy over all images and fonts that are actually used in the ebook
|
||||
# and remove all font files from mobi7 since not supported
|
||||
imgnames = unipath.listdir(self.imgdir)
|
||||
for name in imgnames:
|
||||
if usedmap.get(name, "not used") == "used":
|
||||
filein = os.path.join(self.imgdir, name)
|
||||
if name.endswith(".ttf"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
elif name.endswith(".otf"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
elif name.endswith(".failed"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
else:
|
||||
fileout = os.path.join(self.k8images, name)
|
||||
data = b""
|
||||
with open(pathof(filein), "rb") as f:
|
||||
data = f.read()
|
||||
if obfuscate_data:
|
||||
if name in obfuscate_data:
|
||||
data = mangle_fonts(key, data)
|
||||
open(pathof(fileout), "wb").write(data)
|
||||
if name.endswith(".ttf") or name.endswith(".otf"):
|
||||
os.remove(pathof(filein))
|
||||
|
||||
# opf file name hard coded to "content.opf"
|
||||
container = '<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||
container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
|
||||
container += " <rootfiles>\n"
|
||||
container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
|
||||
container += " </rootfiles>\n</container>\n"
|
||||
fileout = os.path.join(self.k8metainf, "container.xml")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(container.encode("utf-8"))
|
||||
|
||||
if obfuscate_data:
|
||||
encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
|
||||
xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
|
||||
for font in obfuscate_data:
|
||||
encryption += " <enc:EncryptedData>\n"
|
||||
encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
|
||||
encryption += " <enc:CipherData>\n"
|
||||
encryption += (
|
||||
' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
|
||||
)
|
||||
encryption += " </enc:CipherData>\n"
|
||||
encryption += " </enc:EncryptedData>\n"
|
||||
encryption += "</encryption>\n"
|
||||
fileout = os.path.join(self.k8metainf, "encryption.xml")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(encryption.encode("utf-8"))
|
||||
|
||||
# ready to build epub
|
||||
self.outzip = zipfile.ZipFile(pathof(bname), "w")
|
||||
|
||||
# add the mimetype file uncompressed
|
||||
mimetype = b"application/epub+zip"
|
||||
fileout = os.path.join(self.k8dir, "mimetype")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(mimetype)
|
||||
nzinfo = ZipInfo("mimetype", compress_type=zipfile.ZIP_STORED)
|
||||
nzinfo.external_attr = 0o600 << 16 # make this a normal file
|
||||
self.outzip.writestr(nzinfo, mimetype)
|
||||
self.zipUpDir(self.outzip, self.k8dir, "META-INF")
|
||||
self.zipUpDir(self.outzip, self.k8dir, "OEBPS")
|
||||
self.outzip.close()
|
||||
21
mobiparse/mobi/x
Normal file
@@ -0,0 +1,21 @@
|
||||
# KF8 (Mobi 8)
|
||||
if mh.isK8():
|
||||
processMobi8(
|
||||
mh,
|
||||
metadata,
|
||||
sect,
|
||||
files,
|
||||
rscnames,
|
||||
pagemapproc,
|
||||
k8resc,
|
||||
obfuscate_data,
|
||||
apnxfile,
|
||||
epubver,
|
||||
)
|
||||
|
||||
# Old Mobi (Mobi 7)
|
||||
elif not k8only:
|
||||
processMobi7(mh, metadata, sect, files, rscnames)
|
||||
|
||||
# CGDBG
|
||||
print('k8only {} mh.isK8() {}'.format(k8only, mh.isK8()))
|
||||