kindle manager
mobiparse/mobi/__init__.py (new executable file, 7 lines)
@@ -0,0 +1,7 @@
import os

os.environ["LOGURU_AUTOINIT"] = "False"
from mobi.extract import extract
from mobi.extract import extracttest

__version__ = "0.3.1"
mobiparse/mobi/compatibility_utils.py (new executable file, 295 lines)
@@ -0,0 +1,295 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
# of conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import unicode_literals, division, absolute_import, print_function

import sys
import codecs

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

iswindows = sys.platform.startswith("win")

try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

if PY2:
    from HTMLParser import HTMLParser

    _h = HTMLParser()
elif sys.version_info[1] < 4:
    import html.parser

    _h = html.parser.HTMLParser()
else:
    import html as _h

if PY3:
    text_type = str
    binary_type = bytes
    # if you will be printing arbitrary binary data to stdout on python 3
    # sys.stdin = sys.stdin.detach()
    # sys.stdout = sys.stdout.detach()
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
else:
    range = xrange
    text_type = unicode
    binary_type = str
    # if you will be printing unicode under python 2 you need to protect against
    # sys.stdout.encoding being None, which forces ascii encoding of unicode
    # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
    # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8

# NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
# (and they amazingly claim it is by design and not a bug!)

# To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>

# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>

# This mind boggling behaviour also happens when indexing a bytestring and/or
# iterating over a bytestring. In other words it will return an int but not
# the byte itself!

# The only way to access a single byte as a byte in a bytestring, and get the byte in both
# Python 2 and Python 3, is to use a slice.

# This problem is so common there are horrible hacks floating around the net to **try**
# to work around it, so that code that works on both Python 2 and Python 3 is possible.

# So in order to write code that works on both Python 2 and Python 3:
# if you index or access a single byte and want its ord() then use the bord() function;
# if instead you want it as a single character byte use the bchar() function,
# both of which are defined below.

if PY3:
    # Also note: if you decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
    # in place of ascii you will get a one-to-one mapping from byte values to
    # integer code points (in the 0 - 255 range)

    def bchr(s):
        return bytes([s])

    def bstr(s):
        if isinstance(s, str):
            return bytes(s, "latin-1")
        else:
            return bytes(s)

    def bord(s):
        return s

    def bchar(s):
        return bytes([s])


else:

    def bchr(s):
        return chr(s)

    def bstr(s):
        return str(s)

    def bord(s):
        return ord(s)

    def bchar(s):
        return s
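
# A quick illustration of the helpers above (same results on Python 2 and Python 3):
# >>> data = b"ABC"
# >>> bord(data[0])     # 65 on both
# >>> bchar(data[0])    # b'A' on both
# >>> bchr(65)          # b'A' on both
# >>> data[0:1]         # b'A' - slicing is the portable way to take a single byte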


if PY3:
    # list-producing versions of the major Python iterating functions
    def lrange(*args, **kwargs):
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        return list(filter(*args, **kwargs))


else:
    import __builtin__

    # Python 2-builtin ranges produce lists
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter

# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii


def hexlify(bdata):
    return (binascii.hexlify(bdata)).decode("ascii")
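# For example, on either platform:
# >>> hexlify(b"\x0f\xff")
# '0fff'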

# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5, Python 3 is okay with either

# If you: import re
# note: Python 3 "re" requires the pattern to be the exact same type as the data to be
# searched ... but u"" is not allowed for the pattern itself only b""
# Python 2.X allows the pattern to be any type and converts it to match the data
# and returns the same type as the data


# convert string to be utf-8 encoded
def utf8_str(p, enc="utf-8"):
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode("utf-8")
    if enc != "utf-8":
        return p.decode(enc).encode("utf-8")
    return p


# convert string to be unicode encoded
def unicode_str(p, enc="utf-8"):
    if p is None:
        return None
    if isinstance(p, text_type):
        return p
    return p.decode(enc)
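
# A minimal sketch of the two converters above:
# >>> utf8_str("caf\u00e9")          # b'caf\xc3\xa9' - text is encoded to utf-8 bytes
# >>> unicode_str(b"caf\xc3\xa9")    # 'café' - bytes are decoded using enc (utf-8 by default)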


ASCII_CHARS = set(chr(x) for x in range(128))
URL_SAFE = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789" "#" "_.-/~"
)
IRI_UNSAFE = ASCII_CHARS - URL_SAFE


# returns a quoted IRI (not a URI)
def quoteurl(href):
    if isinstance(href, binary_type):
        href = href.decode("utf-8")
    result = []
    for char in href:
        if char in IRI_UNSAFE:
            char = "%%%02x" % ord(char)
        result.append(char)
    return "".join(result)


# unquotes url/iri
def unquoteurl(href):
    if isinstance(href, binary_type):
        href = href.decode("utf-8")
    href = unquote(href)
    return href


# unescape html
def unescapeit(sval):
    return _h.unescape(sval)


# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# ie. To get sys.argv arguments and properly encode them as unicode


def unicode_argv():
    global iswindows
    global PY3
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            start = argc.value - len(sys.argv)
            return [argv[i] for i in range(start, argc.value)]
        # this should never happen
        return None
    else:
        argv = []
        argvencoding = sys.stdin.encoding
        if argvencoding is None:
            argvencoding = sys.getfilesystemencoding()
        if argvencoding is None:
            argvencoding = "utf-8"
        for arg in sys.argv:
            if isinstance(arg, text_type):
                argv.append(arg)
            else:
                argv.append(arg.decode(argvencoding))
        return argv


# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    if PY2:
        try:
            codecs.lookup("cp65001")
        except LookupError:
            codecs.register(
                lambda name: name == "cp65001" and codecs.lookup("utf-8") or None
            )
    return
mobiparse/mobi/extract.py (new executable file, 218 lines)
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
import shutil
import json
import os

from loguru import logger
import tempfile
from os.path import basename, splitext, exists, join
from mobi.kindleunpack import unpackBook
from mobi.makencx import extractNcx


def extract(infile):
    """Extract mobi file and return path to epub file"""

    tempdir = tempfile.mkdtemp(prefix="mobiex")
    if hasattr(infile, "fileno"):
        # file-like object: copy it into the temp dir first
        # (_get_candidate_names() is a private tempfile helper used to pick a random name)
        tempname = next(tempfile._get_candidate_names()) + ".mobi"
        pos = infile.tell()
        infile.seek(0)

        with open(join(tempdir, tempname), "wb") as outfile:
            shutil.copyfileobj(infile, outfile)

        infile.seek(pos)
        infile = join(tempdir, tempname)

    logger.debug("file: %s" % infile)
    fname_in = basename(infile)
    base, ext = splitext(fname_in)
    fname_out_epub = base + ".epub"
    fname_out_html = "book.html"
    fname_out_pdf = base + ".001.pdf"

    unpackBook(infile, tempdir, epubver="A")

    epub_filepath = join(tempdir, "mobi8", fname_out_epub)
    html_filepath = join(tempdir, "mobi7", fname_out_html)
    pdf_filepath = join(tempdir, fname_out_pdf)
    if exists(epub_filepath):
        return tempdir, epub_filepath
    elif exists(html_filepath):
        return tempdir, html_filepath
    elif exists(pdf_filepath):
        return tempdir, pdf_filepath
    raise ValueError("Could not extract from %s" % infile)
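
# Typical use, as a sketch ("demo.mobi" is a placeholder path; the caller owns the tempdir):
# >>> tempdir, filepath = extract("demo.mobi")
# >>> filepath                 # .../mobi8/demo.epub, .../mobi7/book.html or a .pdf
# >>> shutil.rmtree(tempdir)   # clean up when done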


def extracttest(infile):
    """Extract mobi file and return path to epub file"""

    tempdir = './t/'
    if hasattr(infile, "fileno"):
        tempname = next(tempfile._get_candidate_names()) + ".mobi"
        pos = infile.tell()
        infile.seek(0)
        with open(join(tempdir, tempname), "wb") as outfile:
            shutil.copyfileobj(infile, outfile)
        infile.seek(pos)
        infile = join(tempdir, tempname)
        # tempname 8x2vf7yv.mobi pos 0 infile ./t/8x2vf7yv.mobi
        print('tempname {} pos {} infile {}'.format(tempname, pos, infile))

    logger.debug("file: %s" % infile)
    fname_in = basename(infile)
    base, ext = splitext(fname_in)
    fname_out_epub = base + ".epub"
    fname_out_html = "book.html"
    fname_out_pdf = base + ".001.pdf"

    # infile ./t/8x2vf7yv.mobi
    unpackBook(infile, tempdir, epubver="A")

    epub_filepath = join(tempdir, "mobi8", fname_out_epub)
    html_filepath = join(tempdir, "mobi7", fname_out_html)
    pdf_filepath = join(tempdir, fname_out_pdf)

    # CGDBG
    # epub_filepath ./t/mobi8/p302tbwb.epub html_filepath ./t/mobi7/book.html pdf_filepath ./t/p302tbwb.001.pdf
    print('epub_filepath {} html_filepath {} pdf_filepath {}'.format(epub_filepath, html_filepath, pdf_filepath))

    if exists(epub_filepath):
        return tempdir, epub_filepath
    elif exists(html_filepath):
        return tempdir, html_filepath
    elif exists(pdf_filepath):
        return tempdir, pdf_filepath
    raise ValueError("Could not extract from %s" % infile)


## CG test extractNcx
def extract_ncx_test():
    # infile = "./tests/youxi.mobi"
    # infile = "./tests/xiaodao.mobi"
    filelst = [
        'youxi.mobi',
        'xiaodao.mobi',
        'laocan.azw3',
        # 'shikong.kfx',
        'shisu.azw',
        'ETF全球投资指南(一个账户,投资全球。告诉你用美股ETF投什么、怎么投,低门槛、低成本地进行全球化投资和配_JYT5RZBTKMUUJCV2XZ6RWZFNJWAS3KWP.azw3',
        '正義_ 一場思辨之旅_SVQL2PQHADT6UEVYYMJLNLP3TNVZZ2PO.azw3',
        '赘婿_IFYXK7EVMAJZU6SRDI4G3PF6KG5AA3CF.azw',
        '熊逸·佛学50讲_MW5FG7LWUA3G5ZSRJHOI6YNS4E6EC3AP.azw',
        '哲学·科学·常识_Q7IU43GSXSQ3TZULYN46U5DSTJBOSVTW.azw3',
        '理想国2020年豆瓣高分文学作品集【理想国2020年度最受好评新书】_H5LB5WWKBT55NDVJ44TGWNP3YWUXMGAN.azw',
        '大问题_ 简明哲学导论_3MGHFPCGYUPS5OX6PLI2U2ZN2D4QY6IP.azw',
        '利维坦_BQNVQ3PLMU5NABEUEYPESXWK4KEVXH2V.azw',
        #'马克斯·韦伯作品集(套装6册)【现代社会学奠基人,余英时、苏国勋推荐译本 理想国出品】_LUVR3MEUTXTZLMGRQS7RBEDQ32QB2WDZ.azw',
        '物种起源 (译林人文精选)_EFRUCMHSILFZ7IY75OF6KU3FXKG5DYQS.azw',
        '公式之美 (中国好书,人类最美的23个公式)_JWP4OH34GQTHTM4UB5KVTDDPMV5Q45XW.azw',
        '經濟學人104個大解惑:從紙鈔面額、廣告祕辛,到航空公司如何節省成本的全面揭密_RSN23YIUQOGLI5XMNP4UUWZFFBKW7SWT.azw',
        '快思慢想_6RD4VKANWTU2EIIS5ZQSW4H2WAVW6BKS.azw3',
        '老残游记_GYDUNWUDXOSL6GX376HXF6Y5BZB4BYWK.azw3',
        '美的历程_HRFLXR3SWA7YIMUHHBMOHKIFCXTKWDEN.azw',
        '世俗时代_JAJCAPFHKMBTWHQC3Z3M23MQHVM4LD5H.azw',
        '先知之後_OVMTC2SMQGT6WR5BEPB3NLIHPEXAPOZ2.azw',
        '股市稳赚_TKUOH7HY4AFL6MHGDXOA2JTJMDQMJ2M2.azw',
        '货币野史_TNDZWSPMBH3OU3GA4OONJOOS5N3RML46.azw',
        '黄金时代_UECZKDFEYBBCXD7VEGKUF3JAIZCNQTLD.azw',
        '哲学小引_ZIPCDHPBDNDXGPGFMHVMWTW2T43H4H33.azw',
        '危险的关系(译文名著精选)_LCIX4CJDXKBN44DFWQC6BNQP5T4QUJGQ.azw3',
        '生命是什么-活细胞的物理观_KHTK74YI2ZCNNLTDNO42O5TEUCFY4OAE.azw',
        '时间的边缘_FTWBORVACPNFZEHJGDPP3F5G3ZTP2API.azw3',
        '苏菲的世界_GY2VU27R6NLR5DYLSK2X3SGMDOS6OEHA.azw',
        '责任的重负_IOEVP74VFX7HI7CZLHNVDYUSBR5SKWHP.azw',
        '非常潜力股_RSO2TPJMR4Z5GRVVK27M6CZIOKXDOGFA.azw',
        '超级聊天学_为中国人量身定制的口才实操指南(会聊天,瞬间提升你的魅力指数,和任何人都能说上话,和任何人_UJSIR4LXOYUXNN3VKNM2VEJAZWPWXIA5.azw3',
        '世说新语精读 (汉语言文学原典精读系列)_EIQTMTBSOBUBAZTIO76QY3UE5DHG7QUX.azw',
        '打开:周濂的100堂西方哲学课(一部有营养、有态度,读得懂、读得动的西方哲学史)_NBLLIRUDZVFARGUMVTO3IYM7RZFWAHF5.azw',
        '中国游戏风云_45BZOYKQWCVIMPV5J5TUQB3A4WMSXG6Y.azw3',
        '顾准历史笔记_664YSYONYO3WYLR7KN4EHLAVN5AZYJPJ.azw',
        '克拉拉与太阳_AHT6ZF3TVFNTKQLLAZJHBPWVMXHXP6HC.azw',
        '现代汉英词典_B00771V9HS.azw',
        '现代汉语词典_B00AKJGTAQ.azw',
        '香帅财富报告_US63WJUWKWW2Y7GPKCIDMRVUEYTQVOKM.azw',
        '送你一颗子弹_YIFYW7SIWRQULQ5I7OHSFWW5GWJQPRXF.azw',
        '从沸腾到癫狂_泡沫背后的中国房地产真相_Z67UZLBWFW7E7RXRCMSBQK2X2TQJMV54.azw',
        '诸子百家闪耀时 (豆瓣“大神”林欣浩首部中国哲学史力作)_Z65JN5SBZKB5CXWP4QEJ5N36WBJK7LYG.azw',
        '增广贤文(精)--中华经典名著全本全注全译 (中华书局)_55WRYKJTRRUSR3F7PUHSCT7VCYLNXAO5.azw',
        '吃货的孤单心事_4TSS34PIM3LDOE7STPZLPVI5Z2NJ63ZA.azw3',
        '专业团队的管理_CZQZGKOZ2O4B5657CBJX62YTDTHGFTHI.azw3',
        '《新青年》文选_D63BJHTSQMXJOD3WZMITH5MCXPPX4TQO.azw3',
        '中国改革三步走_YAIFAT7KHQG3FKXYDBGDAODOQB3IXREP.azw3',
        '手把手教你读财报2——18节课看透银行业_N2ULAPFBPTYYROEZR6OXSFRZHWGVPP47.azw',
        '伯罗奔尼撒战争史_7IG7PYJEFQCQZKJFADJQPB63WOJZ457K.azw',
        '股市投资致富之道_DRFDHRKUZOGBOQANCZ7WCXQPO4IFXLZ3.azw',
        '中国人的性格历程_F7WWWMHTDUNPAJPQKPMRQQBUVFVWVO2J.azw',
        '股市投资致富之道_QE5QXSGVXZIU6YKQ37ILICN3Y4KMOHFB.azw',
        '手把手教你读财报_财报是用来排除企业的_BIO354K2A7W6AKDRG672GCTOW256C7W3.azw',
        '不可思议的自然对数 (探秘数学常数)_RXHNPMVRRSAJGPFS4EZMEW6YRIOAUJHL.azw',
        '关键选择【帮助起底27个赚钱的逻辑,经济下行趋势下的个人财富增长方案】_XX73ZPD6RPCZSB3XBKBTZZSC5RKQDXJO.azw',
        '中国法律与中国社会_SH3O32FVYFABLOLPVCAQEG2YFWSLTZKY.azw',
        '印度,漂浮的次大陆_YGTGYKH3CAVXF5QZIORNOU4HY3JWUFT2.azw',
        '关于那个人的备忘录_ZCIQC6KSVBPKAIOJJSASQ4T7CUSC23IO.azw',
        '尼采哲学经典(套装共5册) (李敖力荐台湾经典译本)_OQ6CLDRDV34ZPOF2ZLK6CJTMMDJMLVEW.azw',
        '刘擎西方现代思想讲义_2X4PDLIAHHBVF2JV4WS6SSWMEU6WCC2W.azw3',
        '故事是这个世界的解药_YIRJ7ONGJKHJF4VHEGAHXCRAZRUQWDC5.azw',
        '哲学家们都干了些什么_(2015年全新修订版)_7YZCO42RPEELUVMTCWMXV7VAFJHNUBH3.azw',
        '韩炳哲作品系列(套装共9册)_XQLLBYAIEHIDC6DDKPUSDUCXA5JJDV7E.azw3',
        '異常流行幻象與群眾瘋狂_54WCNGW254RB2VX2QAQOQJCTOIS5UDVZ.azw',
        '通胀来了,你准备好了吗_6EY3XVPBUCU3S2Z6GASBJ3ZFIYG7XAPV.azw3',
        '第二十二条军规:纪念版_XHGD4NJJO7IJJLI6NZQVR7GWGIPZASKU.azw',
        '反智:不願說理的人是偏執 不會說理的人是愚蠢 不敢說理的人是奴隸_FQGEEYA535SOIWBGTZMINNYUUUVOHGCO.azw',
        '中国货币史(校订版)豆瓣9.5分,好评如潮!_5QDPZ4BJCNFCQEW6OOTFT5WGGFEKJVCM.azw',
        '海洋与权力:一部新文明史_DZARAFK3BF3JM2Y6G275TNEHS3YFIY63.azw',
        '蘇東坡新傳(上下合併冊)_MQJ52QLZFG7GJIQXLGO6EOOEWDUPOTGC.azw',
        '刷新:重新发现商业与未来_SJJ2ZXTZYRIYOTA6PRLQNRBILPM6TN3O.azw',
        '禅宗是什么:胡适谈禅说佛_UV6UYZCLIBT7T5XOIZ3XEHLGYP524UUU.azw',
        '可塑的我:自我发展心理学的35堂必修课(自我发展心理学的35堂必修课)_LE5ZTKZITLXRBQVHN2GPNXODGRRZHE7C.azw3',
        '诺贝尔奖经济学合集(套装共5册)(经济学领域的集大成作品)_TD4FJVAQB35P77SCIG3VGERLQZJKG27G.azw',
        '费雪论成长股获利:投资大师80年投资致富的选股方法 (深受普通投资者欢迎与推荐的投资经典,巴菲特之师、“近_MXG3UMBD2MUQN3EEY7VC545U567VQC76.azw',
        '阿含经校注(全九册)【豆瓣9.6高分推荐!线装书局出品!一套书读通佛教“根本佛法”!阿含经专家、苏州西园_4Q3JWY3QBCLYZDBOUFU76KUYXNTXESBB.azw',
        '伤花怒放:摇滚的被缚与抗争_SGVGLBLRNURM5MCXKFKBRLUZS4HLAQOY.azw',
        '区块链:通往资产数字化之路_SHFSC7TLUVLQBYANOLNKJJTWNMHNNVZ5.azw',
        '历史的轨迹——二千年教会史_ZIMUOQ5JGFYGF4SRKFPGX5MEWMC6DPX2.azw',
        '深蓝帝国:英国海军的兴衰(全2册)【《星期日泰晤士报》年度最佳畅销书】 (甲骨文系列)_JRZSCWPDFXTNZRTBCR7HQ6BMXKLJ7QYG.azw3',
        '如何提升专注力:手把手教你用7项改变打造10倍速效率人生_S3NJU6MR5WP3ECFWFQ4NYWHYEB6P5RSZ.azw',
        '性欲和性行为:一种批判理论的99条断想(全2册)【舞蹈家金星、社会学家李银河推荐!继弗洛伊德、福柯之后的_UGCIUTVYKZYV6PDVDQPBM2JCDV7SX6VP.azw',
        '后疫情时代的全球经济与世界秩序(傅莹、蔡昉、江小娟、李扬、余永定、郑永年、迟福林、赵汀阳等20位学者合力_75MHOUFWOONZQOP5CY7E7HMI4P5SZPLT.azw3',
        '何为良好生活:行之于途而应于心_A2ETZLWN3OSZAVWDJOJBB3LTDSFLMHBC.azw',
        '万万没想到:用理工科思维理解世界_57F2POLFMUHUIDHE7KH6Z2Q732VQ5UX3.azw3',
        '徐远的投资课:投资原则与实战方法_7ESQ4POTRSIH4JOOHBL7PSWQ6JT4PLEZ.azw',
        '爆款短视频:如何频繁产出刷屏视频_EWJQBSGVTM2OXQKC5M77MKLFNWGL25CT.azw3',
        '思维的艺术:如何像哲学家一样思考_PSNIXLNDMEXX7T7K5J5NWRDIOFXY3XPX.azw',
        '牛津通识读本:佛学概论(中文版)_TUEO6SWCHMURQIGO3JKQRMZT5DWSYNYP.azw',
        '利益、制度与信息:国内政治与国际关系 (东方编译所译丛)_ESVTYWPTPGMVNMJJW7T5YP2LOQKFSHHY.azw',
        '哲学的故事【让深奥的哲学立刻生动起来!上市首年连续再版22次,迅速译成18种语言,掀起全球哲学热潮】_QCOIVA3QQN2BMYFTZQIXLGD45E2CSUNC.azw',
        '千年贸易战争史——贸易冲突与大国兴衰_ESA55ZHIJ6KVQY3OTCM3N5SB6BHDYKPC.azw3',
        '投资至简:从原点出发构建价值投资体系_GSKBIPYDNJJOAZUMNH4T3PZ2EHYYJPE4.azw3',
        '新金融秩序:如何应对不确定的金融风险(2013年诺贝尔经济学奖得主罗伯特•希勒著作)_YO7GZEODTNSUX4JJR5SRL7MOWP4PZFJ4.azw',
        '超越“街角发言者”:表达权的边缘与中心_P7SJECOJBQOYBJ5D4ZCSTPIFD2ZCMM6C.azw',
        '米开朗琪罗与教皇的天花板(甲骨文系列)_RIR7YAXJJSRRSGEPUU47ULOHDCAKQW7M.azw',
        '诗人十四个:十四位古代诗人和一位现代闯入者 一场始于1600年前的诗歌沙龙_UDDD6AF6HYA4V3SRY6KBN6M677757CCI.azw',
        '叔本华心灵咒语:请优雅地拥抱这个苦难的世界(获得独立的人格,做内心强大的自己!孤独而伟大的哲学家,道破_AZLKLSE2R3KAK5S6WXRYHPTPZLCX4RSO.azw',
        '世界观:现代人必须要懂的科学哲学和科学史(原书第3版)_WI6LHG2VGA6QITWLONOU4KOGNXKGB5EB.azw',
        '野兽绅士(单身男士必备撩妹宝典。坏男孩创始人、教父Tango终极力作!汇集超人气恋爱社区「坏男孩」中超经典_UZKJVFNZXM2C5GHYLRDJABRLKTHZYBFH.azw',
        '怪医笔记【薄世宁、李治中(菠萝)、姬十三、刘润、于莺 真诚力荐!期待值100%的医学题材小说,胸外科医生亲_FXMMP37MN4PRL2TAELQYDDMACRTECSZQ.azw',
        '不拘一格:网飞的自由与责任工作法(网飞官方图书,创始人兼CEO哈斯廷斯重磅作品。一家市值超2000亿美元,全球_3XOUEBICN4XWPCC5FESAMRAQSALMLABE.azw',
        '雅典帝国的覆亡(耶鲁大学教授为你讲述伯罗奔尼撒战争的后十年!)_HU7HIAFMRETK2JTCDVWRDNY35GSHSQGI.azw',
        '无规则游戏:阿富汗屡被中断的历史(本书获“美国北加州图书奖”提名)_SXDQBXUOP5W36IAZFWHXOALY7VS7WCKG.azw',
        '无限记忆力(如何在两周内记住更多知识,改善注意力,并培养出过目不忘的记忆力。 +附:21种实用的记忆力提升_5HCR7CSSQ32XICMV3ZQ5UY4N7AMFDT3S.azw',
        '我是谁,或什么:一部心与自我的辩证奇想集(关于“我”的终极哲学问题,嬉皮年代的思想群峰 本书可能烧掉你_C4HD4VEU4LC54IY4A7I43CK3PONLMNSJ.azw3',
        '高效论证:美国大学最实用的逻辑训练课(全美大学批判性思维经典教材,美国哲学学会重磅推荐)_FDA4AB6DURIL5RT7OZEXM76I2PUW7YCP.azw',
        '一把刀,千个字(茅盾文学奖获奖作家王安忆全新长篇;登顶《收获》长篇小说榜;人民文学出版社倾力打造)_ALQY4IH3LLNYF4KDK7LVXJGOWTYBH4PP.azw',
        '透明社会(以哲学小品文的简练和犀利,照察当今社会情状和人类心灵,洞穿数字媒体时代的群体狂欢和孤独个体_2YTTCLR4QTNFVCWKXZ7GIHEDSNQL6THE.azw',
        '不变与万变:葛剑雄说国史(中国当代历史学家、复旦大学教授葛剑雄重磅新作!全面勾勒中国历史发展的源与流_IOMBENXF5TGWX7F3BJLL322UTV5FDQTZ.azw',
        '下沉年代(白宫之乱的根本原因是什么?拜登上任后美国何去何从?通俗版《美国陷阱》,面对面访谈民主党前幕_KGL4UJC74AOO3HEMV6LRCMRZ5A6YPQGT.azw',
        '一生的读书计划(这一版有实质性的修订和扩充,最突出的变化是推荐的阅读材料的来源范围已经扩展到整个世界_SFGZNMVNLL3RE36GAERWSTDVEWRBZ4MG.azw',
        '三岛由纪夫典藏作品九部(两次入围诺贝尔奖的文学大师三岛由纪夫代表作;日本文学翻译家陈德文先生译本;人_TBNZC7F5EQ5YEKOODF6VMW2I2LBZRD4W.azw',
    ]

    for fn in filelst:
        mhdict = extractNcx(os.path.join('./tests', fn))
        print('process file {} \n {}'.format(fn,
            json.dumps(mhdict, indent=4, sort_keys=True, ensure_ascii=False)))


if __name__ == "__main__":
    # print(extracttest("../tests/demo.mobi"))
    # extract_ncx_test()
    pass
mobiparse/mobi/kindleunpack.py (new executable file, 1199 lines; diff too large to display)
mobiparse/mobi/makencx.py (new executable file, 97 lines)
@@ -0,0 +1,97 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from loguru import logger
from collections import defaultdict

from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str
from .compatibility_utils import unicode_argv, add_cp65001_codec

K8_BOUNDARY = b"BOUNDARY"
""" The section data that divides K8 mobi ebooks. """


class unpackException(Exception):
    pass


# import the kindleunpack support libraries
from .unpack_structure import fileNames
from .mobi_sectioner import Sectionizer
from .mobi_header import MobiHeader
from .mobi_ncx import ncxExtract


# input: mobi file path
# output: ncx dict
def extractNcx(infile):
    infile = unicode_str(infile)
    mhdict = defaultdict(dict)

    # process the PalmDoc database header and verify it is a mobi
    sect = Sectionizer(infile)
    if sect.ident != b"BOOKMOBI" and sect.ident != b"TEXtREAd":
        raise unpackException("Invalid file format")

    logger.debug("dumppalmheader ...")
    sect.dumppalmheader()

    # CGDBG
    print('infile {} '.format(infile))
    print('sect.dumpsectionsinfo() {}'.format(sect.dumpsectionsinfo()))
    print('sect.dumppalmheader() {}'.format(sect.dumppalmheader()))

    # scan sections to see if this is a compound mobi file (K8 format)
    # and build a list of all mobi headers to process.
    mhlst = []

    # CG mobi header
    mh = MobiHeader(sect, 0)
    metadata = mh.getMetaData()

    # if this is a mobi8-only file hasK8 here will be true
    mhlst.append(mh)
    K8Boundary = -1

    if mh.isK8():
        logger.debug("Unpacking a KF8 book...")
        hasK8 = True
    else:
        # CGDBG
        # This is either a Mobipocket 7 or earlier, or a combi M7/KF8
        # Find out which
        hasK8 = False
        for i in range(len(sect.sectionoffsets) - 1):
            before, after = sect.sectionoffsets[i : i + 2]
            if (after - before) == 8:
                data = sect.loadSection(i)
                if data == K8_BOUNDARY:
                    sect.setsectiondescription(i, "Mobi/KF8 Boundary Section")
                    mh = MobiHeader(sect, i + 1)
                    hasK8 = True  # K8
                    mhlst.append(mh)
                    K8Boundary = i
                    break

    # hasK8: the header information includes a KF8 part
    if hasK8:
        logger.debug("Unpacking a Combination M{0:d}/KF8 book...".format(mh.version))
    else:
        logger.debug("Unpacking a Mobipocket {0:d} book...".format(mh.version))

    # loop to process the ncx and write it to json with filename - bookname.ncx.json
    for tmh in mhlst:
        # CG
        # process the toc ncx
        # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num
        logger.debug("Processing ncx / toc ")
        print('hasK8 {} tmh.isK8 {}'.format(hasK8, tmh.isK8()))

        ncx = ncxExtract(tmh)
        ncx_data = ncx.parseNCX()

        # check whether the mobi header information is K8 or K7
        kn = 'k8ncx' if tmh.isK8() else 'k7ncx'
        mhdict[kn] = ncx_data

    return mhdict
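
# Usage sketch ("book.azw3" is a placeholder path): the result maps 'k7ncx' and/or
# 'k8ncx' to the parsed table-of-contents entries.
# >>> mhdict = extractNcx("book.azw3")
# >>> list(mhdict.keys())   # e.g. ['k7ncx', 'k8ncx'] for a combination book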
mobiparse/mobi/mobi_cover.py (new executable file, 245 lines)
@@ -0,0 +1,245 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function
from .compatibility_utils import unicode_str
from loguru import logger
from .unipath import pathof
import os
import imghdr

import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

USE_SVG_WRAPPER = True
""" Set to True to use svg wrapper for default. """

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

COVER_PAGE_FINENAME = "cover_page.xhtml"
""" The name for the cover page. """

DEFAULT_TITLE = "Cover"
""" The default title for the cover page. """

MAX_WIDTH = 4096
""" The max width for the svg cover page. """

MAX_HEIGHT = 4096
""" The max height for the svg cover page. """


def get_image_type(imgname, imgdata=None):
    imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))

    # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
    # with only the magic JPEG bytes out there...
    # ImageMagick handles those, so, do it too.
    if imgtype is None:
        if imgdata is None:
            with open(pathof(imgname), "rb") as f:
                imgdata = f.read()
        if imgdata[0:2] == b"\xFF\xD8":
            # Get last non-null bytes
            last = len(imgdata)
            while imgdata[last - 1 : last] == b"\x00":
                last -= 1
            # Be extra safe, check the trailing bytes, too.
            if imgdata[last - 2 : last] == b"\xFF\xD9":
                imgtype = "jpeg"
    return imgtype


def get_image_size(imgname, imgdata=None):
    """Determine the image type of imgname (or imgdata) and return its size.

    Originally,
    Determine the image type of fhandle and return its size.
    from draco"""
    if imgdata is None:
        fhandle = open(pathof(imgname), "rb")
        head = fhandle.read(24)
    else:
        head = imgdata[0:24]
    if len(head) != 24:
        return

    imgtype = get_image_type(imgname, imgdata)
    if imgtype == "png":
        check = struct.unpack(b">i", head[4:8])[0]
        if check != 0x0D0A1A0A:
            return
        width, height = struct.unpack(b">ii", head[16:24])
    elif imgtype == "gif":
        width, height = struct.unpack(b"<HH", head[6:10])
    elif imgtype == "jpeg" and imgdata is None:
        try:
            fhandle.seek(0)  # Read 0xff next
            size = 2
            ftype = 0
            while not 0xC0 <= ftype <= 0xCF:
                fhandle.seek(size, 1)
                byte = fhandle.read(1)
                while ord(byte) == 0xFF:
                    byte = fhandle.read(1)
                ftype = ord(byte)
                size = struct.unpack(b">H", fhandle.read(2))[0] - 2
            # We are at a SOFn block
            fhandle.seek(1, 1)  # Skip `precision' byte.
            height, width = struct.unpack(b">HH", fhandle.read(4))
        except Exception:  # IGNORE:W0703
            return
    elif imgtype == "jpeg" and imgdata is not None:
        try:
            pos = 0
            size = 2
            ftype = 0
            while not 0xC0 <= ftype <= 0xCF:
                pos += size
                byte = imgdata[pos : pos + 1]
                pos += 1
                while ord(byte) == 0xFF:
                    byte = imgdata[pos : pos + 1]
                    pos += 1
                ftype = ord(byte)
                size = struct.unpack(b">H", imgdata[pos : pos + 2])[0] - 2
                pos += 2
            # We are at a SOFn block
            pos += 1  # Skip `precision' byte.
            height, width = struct.unpack(b">HH", imgdata[pos : pos + 4])
            pos += 4
        except Exception:  # IGNORE:W0703
            return
    else:
        return
    return width, height
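
# For example ("cover.jpg" is a placeholder; None is returned when the type or
# header cannot be determined):
# >>> get_image_size("cover.jpg")          # (600, 800)
# >>> get_image_size(None, imgdata=blob)   # same, probing an in-memory byte string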


# XXX experimental
class CoverProcessor(object):

    """Create a cover page."""

    def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        self.cover_page = COVER_PAGE_FINENAME
        self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
        self.lang = metadata.get("Language", ["en"])[0]
        # This should ensure that if the methods to find the cover image's
        # dimensions should fail for any reason, the SVG routine will not be used.
        [self.width, self.height] = (-1, -1)
        if FORCE_DEFAULT_TITLE:
            self.title = DEFAULT_TITLE
        else:
            self.title = metadata.get("Title", [DEFAULT_TITLE])[0]

        self.cover_image = None
        if imgname is not None:
            self.cover_image = imgname
        elif "CoverOffset" in metadata:
            imageNumber = int(metadata["CoverOffset"][0])
            cover_image = self.rscnames[imageNumber]
            if cover_image is not None:
                self.cover_image = cover_image
            else:
                logger.debug("Warning: Cannot identify the cover image.")
        if self.use_svg:
            try:
                if imgdata is None:
                    fname = os.path.join(files.imgdir, self.cover_image)
                    [self.width, self.height] = get_image_size(fname)
                else:
                    [self.width, self.height] = get_image_size(None, imgdata)
            except:
                self.use_svg = False
            width = self.width
            height = self.height
            if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
                self.use_svg = False
        return

    def getImageName(self):
        return self.cover_image

    def getXHTMLName(self):
        return self.cover_page

    def buildXHTML(self):
        logger.debug("Building a cover page.")
        files = self.files
        cover_image = self.cover_image
        title = self.title
        lang = self.lang

        image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
        image_path = os.path.join(image_dir, cover_image).replace("\\", "/")

        if not self.use_svg:
            data = ""
            data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
            data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
            data += ' xml:lang="{:s}">\n'.format(lang)
            data += "<head>\n<title>{:s}</title>\n".format(title)
            data += '<style type="text/css">\n'
            data += "body {\n margin: 0;\n padding: 0;\n text-align: center;\n}\n"
            data += "div {\n height: 100%;\n width: 100%;\n text-align: center;\n page-break-inside: avoid;\n}\n"
            data += "img {\n display: inline-block;\n height: 100%;\n margin: 0 auto;\n}\n"
            data += "</style>\n</head>\n"
            data += "<body><div>\n"
            data += ' <img src="{:s}" alt=""/>\n'.format(image_path)
            data += "</div></body>\n</html>"
        else:
            width = self.width
            height = self.height
            viewBox = "0 0 {0:d} {1:d}".format(width, height)

            data = ""
            data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
            data += '<html xmlns="http://www.w3.org/1999/xhtml"'
            data += ' xml:lang="{:s}">\n'.format(lang)
            data += "<head>\n <title>{:s}</title>\n".format(title)
            data += '<style type="text/css">\n'
            data += "svg {padding: 0pt; margin:0pt}\n"
            data += "body { text-align: center; padding:0pt; margin: 0pt; }\n"
            data += "</style>\n</head>\n"
            data += "<body>\n <div>\n"
            data += ' <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
            data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(
                viewBox
            )
            data += ' <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(
                height, width, image_path
            )
            data += " </svg>\n"
            data += " </div>\n</body>\n</html>"
        return data

    def writeXHTML(self):
        files = self.files
        cover_page = self.cover_page

        data = self.buildXHTML()

        outfile = os.path.join(files.k8text, cover_page)
        if os.path.exists(pathof(outfile)):
            logger.debug("Warning: {:s} already exists.".format(cover_page))
            os.remove(pathof(outfile))
        with open(pathof(outfile), "wb") as f:
            f.write(data.encode("utf-8"))
        return

    def guide_toxml(self):
        files = self.files
        text_dir = os.path.relpath(files.k8text, files.k8oebps)
        data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
            text_dir, self.cover_page
        )
        return data
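
# Sketch of how the class above is typically driven (files is a fileNames object,
# metadata and rscnames come from the unpacked book; the names are placeholders):
# >>> cover = CoverProcessor(files, metadata, rscnames)
# >>> cover.writeXHTML()                   # writes cover_page.xhtml under files.k8text
# >>> guide_entry = cover.guide_toxml()    # <reference type="cover" .../> for the OPF guide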
mobiparse/mobi/mobi_dict.py (new executable file, 473 lines)
@@ -0,0 +1,473 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
from loguru import logger

if PY2:
    range = xrange
    array_format = b"B"
if PY3:
    unichr = chr
    array_format = "B"

import array

import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

DEBUG_DICT = True


class InflectionData(object):
    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for idata in self.infldatas:
            (start,) = struct.unpack_from(b">L", idata, 0x14)
            (count,) = struct.unpack_from(b">L", idata, 0x18)
            self.starts.append(start)
            self.counts.append(count)

    def lookup(self, lookupvalue):
        i = 0
        rvalue = lookupvalue
        while rvalue >= self.counts[i]:
            rvalue = rvalue - self.counts[i]
            i += 1
            if i == len(self.counts):
                logger.debug("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return rvalue, self.starts[i], self.counts[i], self.infldatas[i]

    def offsets(self, value):
        rvalue, start, count, data = self.lookup(value)
        (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
        if rvalue + 1 < count:
            (nextOffset,) = struct.unpack_from(
                b">H", data, start + 4 + (2 * (rvalue + 1))
            )
        else:
            nextOffset = None
        return offset, nextOffset, data


class dictSupport(object):
    def __init__(self, mh, sect):
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        "read INDX header"
        if not data[:4] == b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
        header["otype"] = otype
        header["oentries"] = oentries

        if DEBUG_DICT:
            logger.debug(
                "otype %d, oentries %d, op1 %d, op2 %d, otagx %d"
                % (otype, oentries, op1, op2, otagx)
            )

        if header["code"] == 0xFDEA or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if DEBUG_DICT:
            logger.debug("parsed INDX header:")
            for key in header:
                logger.debug("%s %x" % (key, header[key]))
            logger.debug("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            logger.debug(
                "Info: Document contains orthographic index, handle as dictionary"
            )
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                logger.debug("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr["count"]
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr["len"]
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData
                )
                if DEBUG_DICT:
                    logger.debug("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    logger.debug(
                        "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    )
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            logger.debug("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr["len"]
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr["count"]
            logger.debug("orthIndexCount is %d" % orthIndexCount)
            if DEBUG_DICT:
                logger.debug("orthTagTable: %s" % tagTable)
            if hordt2 is not None:
                logger.debug(
                    "orth entry uses ordt2 lookup table of type %d" % idxhdr["otype"]
                )
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                logger.debug("Info: Index doesn't contain entry length tags")

            logger.debug("Read dictionary index data")
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo["start"]
                entryCount = hdrinfo["count"]
                idxPositions = []
                for j in range(entryCount):
                    (pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j + 1]
                    textLength = ord(data[startPos : startPos + 1])
                    text = data[startPos + 1 : startPos + 1 + textLength]
                    if hordt2 is not None:
                        utext = ""
                        if idxhdr["otype"] == 0:
                            pattern = b">H"
                            inc = 2
                        else:
                            pattern = b">B"
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            (off,) = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode("utf-8")

                    tagMap = getTagMap(
                        controlByteCount,
                        tagTable,
                        data,
                        startPos + 1 + textLength,
                        endPos,
                    )
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2A in tagMap:
                            inflectionGroups = self.getInflectionGroups(
                                text,
                                inflectionControlByteCount,
                                inflectionTagTable,
                                dinfl,
                                inflNameData,
                                tagMap[0x2A],
                            )
                        else:
                            inflectionGroups = b""
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = (
                                b'<idx:entry scriptable="yes"><idx:orth value="'
                                + text
                                + b'">'
                                + inflectionGroups
                                + b"</idx:orth>"
                            )
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = (
                                    positionMap[entryStartPosition] + ml
                                )
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                positionMap[entryEndPosition] = (
                                    b"</idx:entry>" + positionMap[entryEndPosition]
                                )
                            else:
                                positionMap[entryEndPosition] = b"</idx:entry>"

                        else:
                            indexTags = (
                                b'<idx:entry>\n<idx:orth value="'
                                + text
                                + b'">\n'
                                + inflectionGroups
                                + b"</idx:entry>\n"
                            )
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = (
                                    positionMap[entryStartPosition] + indexTags
                                )
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        """
        Test if tag table contains given tag.

        @param tagTable: The tag table.
        @param tag: The tag to search.
        @return: True if tag table contains given tag; False otherwise.
        """
        for currentTag, _, _, _ in tagTable:
            if currentTag == tag:
                return True
        return False

    def getInflectionGroups(
        self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList
    ):
        """
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param dinfl: The InflectionData object used to select the right inflection data section.
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        """
        result = b""
        for value in groupList:
            offset, nextOffset, data = dinfl.offsets(value)

            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset : offset + 1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.
            if 0x05 not in tagMap:
                logger.debug("Error: Required tag 0x05 not found in tagMap")
                return b""
            if 0x1A not in tagMap:
                logger.debug("Error: Required tag 0x1a not found in tagMap")
                return b""

            result += b"<idx:infl>"

            for i in range(len(tagMap[0x05])):

                # Get name of inflection rule.
                value = tagMap[0x05][i]
                consumed, textLength = getVariableWidthValue(inflectionNames, value)
                inflectionName = inflectionNames[
                    value + consumed : value + consumed + textLength
                ]

                # Get and apply inflection rule across possibly multiple inflection data sections
                value = tagMap[0x1A][i]
                rvalue, start, count, data = dinfl.lookup(value)
                (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
                textLength = ord(data[offset : offset + 1])
                inflection = self.applyInflectionRule(
                    mainEntry, data, offset + 1, offset + 1 + textLength
                )
                if inflection is not None:
                    result += (
                        b' <idx:iform name="'
                        + inflectionName
                        + b'" value="'
                        + inflection
                        + b'"/>'
                    )

            result += b"</idx:infl>"
        return result

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        """
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        """
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset : charOffset + 1]
            abyte = ord(char)
            if abyte >= 0x0A and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0A
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                elif position == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x03: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x04: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    else:
                        logger.debug(
                            "Error: Inflection rule mode %x is not implemented" % mode
                        )
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            else:
                logger.debug(
                    "Error: Inflection rule mode %x is not implemented" % abyte
                )
                return None
        # note: tostring() is the Python 2 spelling of tobytes()
        return utf8_str(byteArray.tostring())
mobiparse/mobi/mobi_header.py (new executable file, 1032 lines; diff too large to display)
mobiparse/mobi/mobi_html.py (new executable file, 516 lines)
@@ -0,0 +1,516 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, utf8_str
from loguru import logger

if PY2:
    range = xrange

import re

# note: re requires the pattern to be the exact same type as the data to be searched in python3
# but u"" is not allowed for the pattern itself only b""

from .mobi_utils import fromBase32


class HTMLProcessor:
    def __init__(self, files, metadata, rscnames):
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        # for original style mobis, default to including all image files in the opf manifest
        self.used = {}
        for name in rscnames:
            self.used[name] = "used"

    def findAnchors(self, rawtext, indx_data, positionMap):
        # process the raw text
        # find anchors...
        logger.debug("Find link anchors")
        link_pattern = re.compile(
            br"""<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>""", re.IGNORECASE
        )
        # TEST NCX: merge in filepos from indx
        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
        if indx_data:
            pos_indx = [e["pos"] for e in indx_data if e["pos"] > 0]
            pos_links = list(set(pos_links + pos_indx))

        for position in pos_links:
            if position in positionMap:
                positionMap[position] = positionMap[position] + utf8_str(
                    '<a id="filepos%d" />' % position
                )
            else:
                positionMap[position] = utf8_str('<a id="filepos%d" />' % position)

        # apply dictionary metadata and anchors
        logger.debug("Insert data into html")
        pos = 0
        lastPos = len(rawtext)
        dataList = []
        for end in sorted(positionMap.keys()):
            if end == 0 or end > lastPos:
                continue  # something's up - can't put a tag in outside <html>...</html>
            dataList.append(rawtext[pos:end])
            dataList.append(positionMap[end])
            pos = end
        dataList.append(rawtext[pos:])
        srctext = b"".join(dataList)
        rawtext = None
        dataList = None
        self.srctext = srctext
        self.indx_data = indx_data
        return srctext

    def insertHREFS(self):
        srctext = self.srctext
        rscnames = self.rscnames
        metadata = self.metadata

        # put in the hrefs
        logger.debug("Insert hrefs into html")
        # There doesn't seem to be a standard, so search as best as we can

        link_pattern = re.compile(
            br"""<a([^>]*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>""", re.IGNORECASE
        )
        srctext = link_pattern.sub(br"""<a\1href="#filepos\2"\3>""", srctext)

        # remove empty anchors
        logger.debug("Remove empty anchors from html")
        srctext = re.sub(br"<a\s*/>", br"", srctext)
        srctext = re.sub(br"<a\s*>\s*</a>", br"", srctext)

        # convert image references
        logger.debug("Insert image references into html")
        # split string into image tag pieces and other pieces
        image_pattern = re.compile(br"""(<img.*?>)""", re.IGNORECASE)
        image_index_pattern = re.compile(
            br"""recindex=['"]{0,1}([0-9]+)['"]{0,1}""", re.IGNORECASE
        )
        srcpieces = image_pattern.split(srctext)
        srctext = self.srctext = None

        # all odd pieces are image tags (null strings on even pieces if no space between them in srctext)
        for i in range(1, len(srcpieces), 2):
            tag = srcpieces[i]
            for m in image_index_pattern.finditer(tag):
                imageNumber = int(m.group(1))
                imageName = rscnames[imageNumber - 1]
                if imageName is None:
                    logger.debug(
                        "Error: Referenced image %s was not recognized as a valid image"
                        % imageNumber
                    )
                else:
                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
                    tag = image_index_pattern.sub(replacement, tag, 1)
            srcpieces[i] = tag
        srctext = b"".join(srcpieces)

        # add in character set meta into the html header if needed
        if "Codec" in metadata:
            srctext = (
                srctext[0:12]
                + b'<meta http-equiv="content-type" content="text/html; charset='
                + utf8_str(metadata.get("Codec")[0])
                + b'" />'
                + srctext[12:]
            )
        return srctext, self.used


class XHTMLK8Processor:
    def __init__(self, rscnames, k8proc):
        self.rscnames = rscnames
        self.k8proc = k8proc
        self.used = {}

    def buildXHTML(self):

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
        # XXXX is the offset in records into divtbl
        # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position
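        # for example, kindle:pos:fid:000A:off:000000000V resolves to divtbl record
        # fromBase32(b"000A") == 10, with fromBase32(b"000000000V") == 31 added to that
        # record's insert position (illustrative values only)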
|
||||
|
||||
# pos:fid pattern
|
||||
posfid_pattern = re.compile(br"""(<a.*?href=.*?>)""", re.IGNORECASE)
|
||||
posfid_index_pattern = re.compile(
|
||||
br"""['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']"""
|
||||
)
|
||||
|
||||
parts = []
|
||||
logger.debug("Building proper xhtml for each file")
|
||||
for i in range(self.k8proc.getNumberOfParts()):
|
||||
part = self.k8proc.getPart(i)
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)
|
||||
|
||||
# internal links
|
||||
srcpieces = posfid_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in posfid_index_pattern.finditer(tag):
|
||||
posfid = m.group(1)
|
||||
offset = m.group(2)
|
||||
filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
|
||||
if idtag == b"":
|
||||
replacement = b'"' + utf8_str(filename) + b'"'
|
||||
else:
|
||||
replacement = (
|
||||
b'"' + utf8_str(filename) + b"#" + idtag + b'"'
|
||||
)
|
||||
tag = posfid_index_pattern.sub(replacement, tag, 1)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
parts.append(part)
|
||||
|
||||
# we are free to cut and paste as we see fit
|
||||
# we can safely remove all of the Kindlegen generated aid tags
|
||||
# change aid ids that are in k8proc.linked_aids to xhtml ids
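# e.g. (hypothetical aid value) <p aid="0A3"> becomes <p id="aid-0A3"> when
# b"0A3" is in linked_aids, otherwise the aid attribute is dropped leaving <p>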
|
||||
find_tag_with_aid_pattern = re.compile(
|
||||
br"""(<[^>]*\said\s*=[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
within_tag_aid_position_pattern = re.compile(br"""\said\s*=['"]([^'"]*)['"]""")
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
srcpieces = find_tag_with_aid_pattern.split(part)
|
||||
for j in range(len(srcpieces)):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in within_tag_aid_position_pattern.finditer(tag):
|
||||
try:
|
||||
aid = m.group(1)
|
||||
except IndexError:
|
||||
aid = None
|
||||
replacement = b""
|
||||
if aid in self.k8proc.linked_aids:
|
||||
replacement = b' id="aid-' + aid + b'"'
|
||||
tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
parts[i] = part
|
||||
|
||||
# we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
|
||||
# with page-break-after style patterns
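# e.g. <div data-AmznPageBreak="always"> becomes
# <div style="page-break-after:always">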
|
||||
find_tag_with_AmznPageBreak_pattern = re.compile(
|
||||
br"""(<[^>]*\sdata-AmznPageBreak=[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
within_tag_AmznPageBreak_position_pattern = re.compile(
|
||||
br"""\sdata-AmznPageBreak=['"]([^'"]*)['"]"""
|
||||
)
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
|
||||
for j in range(len(srcpieces)):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
|
||||
lambda m: b' style="page-break-after:' + m.group(1) + b'"', tag
|
||||
)
|
||||
part = b"".join(srcpieces)
|
||||
parts[i] = part
|
||||
|
||||
# we have to handle substitutions for the flows pieces first as they may
|
||||
# be inlined into the xhtml text
|
||||
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||
# kindle:embed:XXXX (used for fonts)
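# illustrative (hypothetical resource name): src="kindle:embed:0002?mime=image/jpeg"
# resolves via fromBase32("0002") == 2 to rscnames[1] and is rewritten to
# src="../Images/<that resource name>"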
|
||||
|
||||
flows = []
|
||||
flows.append(None)
|
||||
flowinfo = []
|
||||
flowinfo.append([None, None, None, None])
|
||||
|
||||
# regular expression search patterns
|
||||
img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
|
||||
img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
flow_pattern = re.compile(
|
||||
br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
url_pattern = re.compile(br"""(url\(.*?\))""", re.IGNORECASE)
|
||||
url_img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]""",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
font_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)["')]""", re.IGNORECASE
|
||||
)
|
||||
url_css_index_pattern = re.compile(
|
||||
br"""kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*""", re.IGNORECASE
|
||||
)
|
||||
url_svg_image_pattern = re.compile(
|
||||
br"""kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(1, self.k8proc.getNumberOfFlows()):
|
||||
[ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
|
||||
flowpart = self.k8proc.getFlow(i)
|
||||
|
||||
# links to raster image files from image tags
|
||||
# image_pattern
|
||||
srcpieces = img_pattern.split(flowpart)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<im"):
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
if imageName is not None:
|
||||
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
flowpart = b"".join(srcpieces)
|
||||
|
||||
# replacements inside css url():
|
||||
srcpieces = url_pattern.split(flowpart)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
|
||||
# process links to raster image files
|
||||
for m in url_img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if imageName is not None:
|
||||
replacement = osep + b"../Images/" + utf8_str(imageName) + csep
|
||||
self.used[imageName] = "used"
|
||||
tag = url_img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
|
||||
# process links to fonts
|
||||
for m in font_index_pattern.finditer(tag):
|
||||
fontNumber = fromBase32(m.group(1))
|
||||
fontName = self.rscnames[fontNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if fontName is None:
|
||||
logger.debug(
|
||||
"Error: Referenced font %s was not recognized as a valid font in %s"
|
||||
% (fontNumber, tag)
|
||||
)
|
||||
else:
|
||||
replacement = osep + b"../Fonts/" + utf8_str(fontName) + csep
|
||||
tag = font_index_pattern.sub(replacement, tag, 1)
|
||||
self.used[fontName] = "used"
|
||||
|
||||
# process links to other css pieces
|
||||
for m in url_css_index_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
|
||||
tag = url_css_index_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
|
||||
# process links to svg images
|
||||
for m in url_svg_image_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
replacement = b'"../' + utf8_str(pdir) + b"/" + utf8_str(fnm) + b'"'
|
||||
tag = url_svg_image_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
|
||||
srcpieces[j] = tag
|
||||
flowpart = b"".join(srcpieces)
|
||||
|
||||
# store away in our own copy
|
||||
flows.append(flowpart)
|
||||
|
||||
# I do not think this case exists and even if it does exist, it needs to be done in a separate
|
||||
# pass to prevent inlining a flow piece into another flow piece before the inserted one or the
|
||||
# target one has been fully processed
|
||||
|
||||
# but keep it around in case we end up needing it
|
||||
|
||||
# flow pattern not inside url()
|
||||
# srcpieces = tag_pattern.split(flowpart)
|
||||
# for j in range(1, len(srcpieces),2):
|
||||
# tag = srcpieces[j]
|
||||
# if tag.startswith(b'<'):
|
||||
# for m in flow_pattern.finditer(tag):
|
||||
# num = fromBase32(m.group(1))
|
||||
# [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
# flowtext = self.k8proc.getFlow(num)
|
||||
# if fmt == b'inline':
|
||||
# tag = flowtext
|
||||
# else:
|
||||
# replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
|
||||
# tag = flow_pattern.sub(replacement, tag, 1)
|
||||
# self.used[fnm] = 'used'
|
||||
# srcpieces[j] = tag
|
||||
# flowpart = b"".join(srcpieces)
|
||||
|
||||
# now handle the main text xhtml parts
|
||||
|
||||
# Handle the flow items in the XHTML text pieces
|
||||
# kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
flow_pattern = re.compile(
|
||||
br"""['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]""", re.IGNORECASE
|
||||
)
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
# flow pattern
|
||||
srcpieces = tag_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<"):
|
||||
for m in flow_pattern.finditer(tag):
|
||||
num = fromBase32(m.group(1))
|
||||
if num > 0 and num < len(self.k8proc.flowinfo):
|
||||
[typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
|
||||
flowpart = flows[num]
|
||||
if fmt == b"inline":
|
||||
tag = flowpart
|
||||
else:
|
||||
replacement = (
|
||||
b'"../'
|
||||
+ utf8_str(pdir)
|
||||
+ b"/"
|
||||
+ utf8_str(fnm)
|
||||
+ b'"'
|
||||
)
|
||||
tag = flow_pattern.sub(replacement, tag, 1)
|
||||
self.used[fnm] = "used"
|
||||
else:
|
||||
print(
|
||||
"warning: ignoring non-existent flow link",
|
||||
tag,
|
||||
" value 0x%x" % num,
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# Handle any embedded raster images links in style= attributes urls
|
||||
style_pattern = re.compile(
|
||||
br"""(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)""", re.IGNORECASE
|
||||
)
|
||||
img_index_pattern = re.compile(
|
||||
br"""[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# replace urls in style attributes
|
||||
srcpieces = style_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if b"kindle:embed" in tag:
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
osep = m.group()[0:1]
|
||||
csep = m.group()[-1:]
|
||||
if imageName is not None:
|
||||
replacement = (
|
||||
osep + b"../Images/" + utf8_str(imageName) + csep
|
||||
)
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s in style url was not recognized in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# Handle any embedded raster images links in the xhtml text
|
||||
# kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
|
||||
img_pattern = re.compile(br"""(<[img\s|image\s][^>]*>)""", re.IGNORECASE)
|
||||
img_index_pattern = re.compile(br"""['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]""")
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# links to raster image files
|
||||
# image_pattern
|
||||
srcpieces = img_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<im"):
|
||||
for m in img_index_pattern.finditer(tag):
|
||||
imageNumber = fromBase32(m.group(1))
|
||||
imageName = self.rscnames[imageNumber - 1]
|
||||
if imageName is not None:
|
||||
replacement = b'"../Images/' + utf8_str(imageName) + b'"'
|
||||
self.used[imageName] = "used"
|
||||
tag = img_index_pattern.sub(replacement, tag, 1)
|
||||
else:
|
||||
logger.debug(
|
||||
"Error: Referenced image %s was not recognized as a valid image in %s"
|
||||
% (imageNumber, tag)
|
||||
)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
# finally perform any general cleanups needed to make valid XHTML
|
||||
# these include:
|
||||
# in svg tags replace "perserveaspectratio" attributes with "perserveAspectRatio"
|
||||
# in svg tags replace "viewbox" attributes with "viewBox"
|
||||
# in <li> remove value="XX" attributes since these are illegal
|
||||
tag_pattern = re.compile(br"""(<[^>]*>)""")
|
||||
li_value_pattern = re.compile(
|
||||
br"""\svalue\s*=\s*['"][^'"]*['"]""", re.IGNORECASE
|
||||
)
|
||||
|
||||
for i in range(len(parts)):
|
||||
part = parts[i]
|
||||
[partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
|
||||
|
||||
# tag pattern
|
||||
srcpieces = tag_pattern.split(part)
|
||||
for j in range(1, len(srcpieces), 2):
|
||||
tag = srcpieces[j]
|
||||
if tag.startswith(b"<svg") or tag.startswith(b"<SVG"):
|
||||
tag = tag.replace(b"preserveaspectratio", b"preserveAspectRatio")
|
||||
tag = tag.replace(b"viewbox", b"viewBox")
|
||||
elif tag.startswith(b"<li ") or tag.startswith(b"<LI "):
|
||||
tagpieces = li_value_pattern.split(tag)
|
||||
tag = b"".join(tagpieces)
|
||||
srcpieces[j] = tag
|
||||
part = b"".join(srcpieces)
|
||||
# store away modified version
|
||||
parts[i] = part
|
||||
|
||||
self.k8proc.setFlows(flows)
|
||||
self.k8proc.setParts(parts)
|
||||
|
||||
return self.used
|
||||
327
mobiparse/mobi/mobi_index.py
Executable file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bchr, bstr, bord
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
from .mobi_utils import toHex
|
||||
|
||||
|
||||
class MobiIndex:
|
||||
# CGDBG
|
||||
def __init__(self, sect, DEBUG=True):
|
||||
self.sect = sect
|
||||
self.DEBUG = DEBUG
|
||||
|
||||
def getIndexData(self, idx, label="Unknown"):
|
||||
sect = self.sect
|
||||
outtbl = []
|
||||
ctoc_text = {}
|
||||
if idx != 0xFFFFFFFF:
|
||||
sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
|
||||
data = sect.loadSection(idx)
|
||||
idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
|
||||
IndexCount = idxhdr["count"]
|
||||
# handle the case of multiple sections used for CTOC
|
||||
rec_off = 0
|
||||
off = idx + IndexCount + 1
|
||||
for j in range(idxhdr["nctoc"]):
|
||||
cdata = sect.loadSection(off + j)
|
||||
sect.setsectiondescription(off + j, label + " CTOC Data " + str(j))
|
||||
ctocdict = self.readCTOC(cdata)
|
||||
for k in ctocdict:
|
||||
ctoc_text[k + rec_off] = ctocdict[k]
|
||||
rec_off += 0x10000
|
||||
tagSectionStart = idxhdr["len"]
|
||||
controlByteCount, tagTable = readTagSection(tagSectionStart, data)
|
||||
if self.DEBUG:
|
||||
logger.debug("ControlByteCount is", controlByteCount)
|
||||
logger.debug("IndexCount is", IndexCount)
|
||||
logger.debug("TagTable: %s" % tagTable)
|
||||
for i in range(idx + 1, idx + 1 + IndexCount):
|
||||
sect.setsectiondescription(
|
||||
i, "{0} Extra {1:d} INDX section".format(label, i - idx)
|
||||
)
|
||||
data = sect.loadSection(i)
|
||||
hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
|
||||
idxtPos = hdrinfo["start"]
|
||||
entryCount = hdrinfo["count"]
|
||||
if self.DEBUG:
|
||||
logger.debug("%s %s" % (idxtPos, entryCount))
|
||||
# loop through to build up the IDXT position starts
|
||||
idxPositions = []
|
||||
for j in range(entryCount):
|
||||
(pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
|
||||
idxPositions.append(pos)
|
||||
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||
idxPositions.append(idxtPos)
|
||||
# for each entry in the IDXT build up the tagMap and any associated text
|
||||
for j in range(entryCount):
|
||||
startPos = idxPositions[j]
|
||||
endPos = idxPositions[j + 1]
|
||||
textLength = ord(data[startPos : startPos + 1])
|
||||
text = data[startPos + 1 : startPos + 1 + textLength]
|
||||
if hordt2 is not None:
|
||||
text = b"".join(bchr(hordt2[bord(x)]) for x in text)
|
||||
tagMap = getTagMap(
|
||||
controlByteCount,
|
||||
tagTable,
|
||||
data,
|
||||
startPos + 1 + textLength,
|
||||
endPos,
|
||||
)
|
||||
outtbl.append([text, tagMap])
|
||||
if self.DEBUG:
|
||||
# CGDBG
|
||||
logger.debug('tagMap {}'.format(tagMap))
|
||||
logger.debug('text {}'.format(text))
|
||||
logger.debug('data {}'.format(data))
|
||||
|
||||
return outtbl, ctoc_text
|
||||
|
||||
def parseINDXHeader(self, data):
|
||||
"read INDX header"
|
||||
if not data[:4] == b"INDX":
|
||||
logger.debug("Warning: index section is not INDX")
|
||||
return False
|
||||
words = (
|
||||
"len",
|
||||
"nul1",
|
||||
"type",
|
||||
"gen",
|
||||
"start",
|
||||
"count",
|
||||
"code",
|
||||
"lng",
|
||||
"total",
|
||||
"ordt",
|
||||
"ligt",
|
||||
"nligt",
|
||||
"nctoc",
|
||||
)
|
||||
num = len(words)
|
||||
values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
|
||||
header = {}
|
||||
for n in range(num):
|
||||
header[words[n]] = values[n]
|
||||
|
||||
ordt1 = None
|
||||
ordt2 = None
|
||||
|
||||
ocnt, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
|
||||
if header["code"] == 0xFDEA or ocnt != 0 or oentries > 0:
|
||||
# horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
|
||||
# them in the proper place in the header. They seem to be codepage 65002 which seems
|
||||
# to be some sort of strange EBCDIC utf-8 or 16 encoded strings
|
||||
|
||||
# so we need to look for them and store them away to process leading text
|
||||
# ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
|
||||
# we only ever seem to use the second but ...
|
||||
assert ocnt == 1
|
||||
assert data[op1 : op1 + 4] == b"ORDT"
|
||||
assert data[op2 : op2 + 4] == b"ORDT"
|
||||
ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
|
||||
ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)
|
||||
|
||||
if self.DEBUG:
|
||||
logger.debug("parsed INDX header:")
|
||||
for n in words:
|
||||
logger.debug("%s %X" % (n, header[n]))
|
||||
logger.debug("")
|
||||
return header, ordt1, ordt2
|
||||
|
||||
def readCTOC(self, txtdata):
|
||||
# read all blocks from CTOC
|
||||
ctoc_data = {}
|
||||
offset = 0
|
||||
while offset < len(txtdata):
|
||||
if PY2:
|
||||
if txtdata[offset] == b"\0":
|
||||
break
|
||||
else:
|
||||
if txtdata[offset] == 0:
|
||||
break
|
||||
idx_offs = offset
|
||||
# first n bytes: name len as vwi
|
||||
pos, ilen = getVariableWidthValue(txtdata, offset)
|
||||
offset += pos
|
||||
# <len> next bytes: name
|
||||
name = txtdata[offset : offset + ilen]
|
||||
offset += ilen
|
||||
if self.DEBUG:
|
||||
logger.debug("name length is %s" % ilen)
|
||||
logger.debug("%s %s", (idx_offs, name))
|
||||
ctoc_data[idx_offs] = name
|
||||
return ctoc_data
|
||||
|
||||
|
||||
def getVariableWidthValue(data, offset):
|
||||
"""
|
||||
Decode variable width value from given bytes.
|
||||
|
||||
@param data: The bytes to decode.
|
||||
@param offset: The start offset into data.
|
||||
@return: Tuple of consumed bytes count and decoded value.
|
||||
"""
|
||||
value = 0
|
||||
consumed = 0
|
||||
finished = False
|
||||
while not finished:
|
||||
v = data[offset + consumed : offset + consumed + 1]
|
||||
consumed += 1
|
||||
if ord(v) & 0x80:
|
||||
finished = True
|
||||
value = (value << 7) | (ord(v) & 0x7F)
|
||||
return consumed, value
|
||||
|
||||
|
||||
def readTagSection(start, data):
|
||||
"""
|
||||
Read tag section from given data.
|
||||
|
||||
@param start: The start position in the data.
|
||||
@param data: The data to process.
|
||||
@return: Tuple of control byte count and list of tag tuples.
|
||||
"""
|
||||
controlByteCount = 0
|
||||
tags = []
|
||||
if data[start : start + 4] == b"TAGX":
|
||||
(firstEntryOffset,) = struct.unpack_from(b">L", data, start + 0x04)
|
||||
(controlByteCount,) = struct.unpack_from(b">L", data, start + 0x08)
|
||||
|
||||
# Skip the first 12 bytes already read above.
|
||||
for i in range(12, firstEntryOffset, 4):
|
||||
pos = start + i
|
||||
tags.append(
|
||||
(
|
||||
ord(data[pos : pos + 1]),
|
||||
ord(data[pos + 1 : pos + 2]),
|
||||
ord(data[pos + 2 : pos + 3]),
|
||||
ord(data[pos + 3 : pos + 4]),
|
||||
)
|
||||
)
|
||||
return controlByteCount, tags
|
||||
|
||||
|
||||
def countSetBits(value, bits=8):
|
||||
"""
|
||||
Count the set bits in the given value.
|
||||
|
||||
@param value: Integer value.
|
||||
@param bits: The number of bits of the input value (defaults to 8).
|
||||
@return: Number of set bits.
|
||||
"""
|
||||
count = 0
|
||||
for _ in range(bits):
|
||||
if value & 0x01 == 0x01:
|
||||
count += 1
|
||||
value = value >> 1
|
||||
return count
|
||||
|
||||
|
||||
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
|
||||
"""
|
||||
Create a map of tags and values from the given byte section.
|
||||
|
||||
@param controlByteCount: The number of control bytes.
|
||||
@param tagTable: The tag table.
|
||||
@param entryData: The data to process.
|
||||
@param startPos: The starting position in entryData.
|
||||
@param endPos: The end position in entryData or None if it is unknown.
|
||||
@return: Hashmap of tag and list of values.
|
||||
"""
|
||||
tags = []
|
||||
tagHashMap = {}
|
||||
controlByteIndex = 0
|
||||
dataStart = startPos + controlByteCount
|
||||
|
||||
for tag, valuesPerEntry, mask, endFlag in tagTable:
|
||||
if endFlag == 0x01:
|
||||
controlByteIndex += 1
|
||||
continue
|
||||
cbyte = ord(
|
||||
entryData[startPos + controlByteIndex : startPos + controlByteIndex + 1]
|
||||
)
|
||||
if 0:
|
||||
logger.debug(
|
||||
"Control Byte Index %0x , Control Byte Value %0x"
|
||||
% (controlByteIndex, cbyte)
|
||||
)
|
||||
|
||||
value = (
|
||||
ord(
|
||||
entryData[startPos + controlByteIndex : startPos + controlByteIndex + 1]
|
||||
)
|
||||
& mask
|
||||
)
|
||||
if value != 0:
|
||||
if value == mask:
|
||||
if countSetBits(mask) > 1:
|
||||
# If all bits of masked value are set and the mask has more than one bit, a variable width value
|
||||
# will follow after the control bytes which defines the length of bytes (NOT the value count!)
|
||||
# which will contain the corresponding variable width values.
|
||||
consumed, value = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
tags.append((tag, None, value, valuesPerEntry))
|
||||
else:
|
||||
tags.append((tag, 1, None, valuesPerEntry))
|
||||
else:
|
||||
# Shift bits to get the masked value.
|
||||
while mask & 0x01 == 0:
|
||||
mask = mask >> 1
|
||||
value = value >> 1
|
||||
tags.append((tag, value, None, valuesPerEntry))
|
||||
for tag, valueCount, valueBytes, valuesPerEntry in tags:
|
||||
values = []
|
||||
if valueCount is not None:
|
||||
# Read valueCount * valuesPerEntry variable width values.
|
||||
for _ in range(valueCount):
|
||||
for _ in range(valuesPerEntry):
|
||||
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
values.append(data)
|
||||
else:
|
||||
# Convert valueBytes to variable width values.
|
||||
totalConsumed = 0
|
||||
while totalConsumed < valueBytes:
|
||||
# Does this work for valuesPerEntry != 1?
|
||||
consumed, data = getVariableWidthValue(entryData, dataStart)
|
||||
dataStart += consumed
|
||||
totalConsumed += consumed
|
||||
values.append(data)
|
||||
if totalConsumed != valueBytes:
|
||||
logger.debug(
|
||||
"Error: Should consume %s bytes, but consumed %s"
|
||||
% (valueBytes, totalConsumed)
|
||||
)
|
||||
tagHashMap[tag] = values
|
||||
# Test that all bytes have been processed if endPos is given.
|
||||
if endPos is not None and dataStart != endPos:
|
||||
# The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
|
||||
for char in entryData[dataStart:endPos]:
|
||||
if bord(char) != 0:
|
||||
logger.debug(
|
||||
"Warning: There are unprocessed index bytes left: %s"
|
||||
% toHex(entryData[dataStart:endPos])
|
||||
)
|
||||
if 0:
|
||||
logger.debug("controlByteCount: %s" % controlByteCount)
|
||||
logger.debug("tagTable: %s" % tagTable)
|
||||
logger.debug("data: %s" % toHex(entryData[startPos:endPos]))
|
||||
logger.debug("tagHashMap: %s" % tagHashMap)
|
||||
break
|
||||
|
||||
return tagHashMap
|
||||
575
mobiparse/mobi/mobi_k8proc.py
Executable file
@@ -0,0 +1,575 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bstr, utf8_str
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import os
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
from .mobi_index import MobiIndex
|
||||
from .mobi_utils import fromBase32
|
||||
from .unipath import pathof
|
||||
|
||||
_guide_types = [
|
||||
b"cover",
|
||||
b"title-page",
|
||||
b"toc",
|
||||
b"index",
|
||||
b"glossary",
|
||||
b"acknowledgements",
|
||||
b"bibliography",
|
||||
b"colophon",
|
||||
b"copyright-page",
|
||||
b"dedication",
|
||||
b"epigraph",
|
||||
b"foreward",
|
||||
b"loi",
|
||||
b"lot",
|
||||
b"notes",
|
||||
b"preface",
|
||||
b"text",
|
||||
]
|
||||
|
||||
# locate beginning and ending positions of tag with specific aid attribute
|
||||
def locate_beg_end_of_tag(ml, aid):
|
||||
pattern = utf8_str(r"""<[^>]*\said\s*=\s*['"]%s['"][^>]*>""" % aid)
|
||||
aid_pattern = re.compile(pattern, re.IGNORECASE)
|
||||
for m in re.finditer(aid_pattern, ml):
|
||||
plt = m.start()
|
||||
pgt = ml.find(b">", plt + 1)
|
||||
return plt, pgt
|
||||
return 0, 0
|
||||
|
||||
|
||||
# iterate over all tags in block in reverse order, i.e. last tag to first tag
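# e.g. list(reverse_tag_iter(b"<p><b>x</b></p>")) == [b"</p>", b"</b>", b"<b>", b"<p>"]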
|
||||
def reverse_tag_iter(block):
|
||||
end = len(block)
|
||||
while True:
|
||||
pgt = block.rfind(b">", 0, end)
|
||||
if pgt == -1:
|
||||
break
|
||||
plt = block.rfind(b"<", 0, pgt)
|
||||
if plt == -1:
|
||||
break
|
||||
yield block[plt : pgt + 1]
|
||||
end = plt
|
||||
|
||||
|
||||
class K8Processor:
|
||||
def __init__(self, mh, sect, files, debug=False):
|
||||
self.sect = sect
|
||||
self.files = files
|
||||
self.mi = MobiIndex(sect)
|
||||
self.mh = mh
|
||||
self.skelidx = mh.skelidx
|
||||
self.fragidx = mh.fragidx
|
||||
self.guideidx = mh.guideidx
|
||||
self.fdst = mh.fdst
|
||||
self.flowmap = {}
|
||||
self.flows = None
|
||||
self.flowinfo = []
|
||||
self.parts = None
|
||||
self.partinfo = []
|
||||
self.linked_aids = set()
|
||||
self.fdsttbl = [0, 0xFFFFFFFF]
|
||||
self.DEBUG = debug
|
||||
|
||||
# read in and parse the FDST info which is very similar in format to the Palm DB section
|
||||
# parsing except it provides offsets into rawML file and not the Palm DB file
|
||||
# this is needed to split up the final css, svg, etc flow section
|
||||
# that can exist at the end of the rawML file
|
||||
if self.fdst != 0xFFFFFFFF:
|
||||
header = self.sect.loadSection(self.fdst)
|
||||
if header[0:4] == b"FDST":
|
||||
(num_sections,) = struct.unpack_from(b">L", header, 0x08)
|
||||
self.fdsttbl = struct.unpack_from(
|
||||
bstr(">%dL" % (num_sections * 2)), header, 12
|
||||
)[::2] + (mh.rawSize,)
|
||||
sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFDST Section Map: %d sections" % num_sections)
|
||||
for j in range(num_sections):
|
||||
logger.debug(
|
||||
"Section %d: 0x%08X - 0x%08X"
|
||||
% (j, self.fdsttbl[j], self.fdsttbl[j + 1])
|
||||
)
|
||||
else:
|
||||
logger.debug("\nError: K8 Mobi with Missing FDST info")
|
||||
|
||||
# read/process skeleton index info to create the skeleton table
|
||||
skeltbl = []
|
||||
if self.skelidx != 0xFFFFFFFF:
|
||||
# for i in range(2):
|
||||
# fname = 'skel%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.skelidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
|
||||
fileptr = 0
|
||||
for [text, tagMap] in outtbl:
|
||||
# file number, skeleton name, fragtbl record count, start position, length
|
||||
skeltbl.append(
|
||||
[fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]
|
||||
)
|
||||
fileptr += 1
|
||||
self.skeltbl = skeltbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nSkel Table: %d entries" % len(self.skeltbl))
|
||||
logger.debug(
|
||||
"table: filenum, skeleton name, frag tbl record count, start position, length"
|
||||
)
|
||||
for j in range(len(self.skeltbl)):
|
||||
logger.debug(self.skeltbl[j])
|
||||
|
||||
# read/process the fragment index to create the fragment table
|
||||
fragtbl = []
|
||||
if self.fragidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'frag%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.fragidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
|
||||
for [text, tagMap] in outtbl:
|
||||
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
|
||||
ctocoffset = tagMap[2][0]
|
||||
ctocdata = ctoc_text[ctocoffset]
|
||||
fragtbl.append(
|
||||
[
|
||||
int(text),
|
||||
ctocdata,
|
||||
tagMap[3][0],
|
||||
tagMap[4][0],
|
||||
tagMap[6][0],
|
||||
tagMap[6][1],
|
||||
]
|
||||
)
|
||||
self.fragtbl = fragtbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFragment Table: %d entries" % len(self.fragtbl))
|
||||
logger.debug(
|
||||
"table: file position, link id text, file num, sequence number, start position, length"
|
||||
)
|
||||
for j in range(len(self.fragtbl)):
|
||||
logger.debug(self.fragtbl[j])
|
||||
|
||||
# read / process guide index for guide elements of opf
|
||||
guidetbl = []
|
||||
if self.guideidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'guide%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.guideidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(
|
||||
self.guideidx, "KF8 Guide elements)"
|
||||
)
|
||||
for [text, tagMap] in outtbl:
|
||||
# ref_type, ref_title, frag number
|
||||
ctocoffset = tagMap[1][0]
|
||||
ref_title = ctoc_text[ctocoffset]
|
||||
ref_type = text
|
||||
fileno = None
|
||||
if 3 in tagMap:
|
||||
fileno = tagMap[3][0]
|
||||
if 6 in tagMap:
|
||||
fileno = tagMap[6][0]
|
||||
guidetbl.append([ref_type, ref_title, fileno])
|
||||
self.guidetbl = guidetbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nGuide Table: %d entries" % len(self.guidetbl))
|
||||
logger.debug("table: ref_type, ref_title, fragtbl entry number")
|
||||
for j in range(len(self.guidetbl)):
|
||||
logger.debug(self.guidetbl[j])
|
||||
|
||||
def buildParts(self, rawML):
|
||||
# now split the rawML into its flow pieces
|
||||
self.flows = []
|
||||
for j in range(0, len(self.fdsttbl) - 1):
|
||||
start = self.fdsttbl[j]
|
||||
end = self.fdsttbl[j + 1]
|
||||
self.flows.append(rawML[start:end])
|
||||
|
||||
# the first piece represents the xhtml text
|
||||
text = self.flows[0]
|
||||
self.flows[0] = b""
|
||||
|
||||
# walk the <skeleton> and fragment tables to build original source xhtml files
|
||||
# *without* destroying any file position information needed for later href processing
|
||||
# and create the final list of file separation start/stop points and related info in partinfo
|
||||
if self.DEBUG:
|
||||
logger.debug("\nRebuilding flow piece 0: the main body of the ebook")
|
||||
self.parts = []
|
||||
self.partinfo = []
|
||||
fragptr = 0
|
||||
baseptr = 0
|
||||
cnt = 0
|
||||
filename = "part%04d.xhtml" % cnt
|
||||
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
|
||||
baseptr = skelpos + skellen
|
||||
skeleton = text[skelpos:baseptr]
|
||||
aidtext = "0"
|
||||
for i in range(fragcnt):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[
|
||||
fragptr
|
||||
]
|
||||
aidtext = idtext[12:-2]
|
||||
if i == 0:
|
||||
filename = "part%04d.xhtml" % filenum
|
||||
slice = text[baseptr : baseptr + length]
|
||||
insertpos = insertpos - skelpos
|
||||
head = skeleton[:insertpos]
|
||||
tail = skeleton[insertpos:]
|
||||
actual_inspos = insertpos
|
||||
if tail.find(b">") < tail.find(b"<") or head.rfind(b">") < head.rfind(
|
||||
b"<"
|
||||
):
|
||||
# There is an incomplete tag in either the head or tail.
|
||||
# This can happen for some badly formed KF8 files
|
||||
logger.debug(
|
||||
"The fragment table for %s has incorrect insert position. Calculating manually."
|
||||
% skelname
|
||||
)
|
||||
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
|
||||
if bp != ep:
|
||||
actual_inspos = ep + 1 + startpos
|
||||
if insertpos != actual_inspos:
|
||||
print(
|
||||
"fixed corrupt fragment table insert position",
|
||||
insertpos + skelpos,
|
||||
actual_inspos + skelpos,
|
||||
)
|
||||
insertpos = actual_inspos
|
||||
self.fragtbl[fragptr][0] = actual_inspos + skelpos
|
||||
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
|
||||
baseptr = baseptr + length
|
||||
fragptr += 1
|
||||
cnt += 1
|
||||
self.parts.append(skeleton)
|
||||
self.partinfo.append([skelnum, "Text", filename, skelpos, baseptr, aidtext])
|
||||
|
||||
assembled_text = b"".join(self.parts)
|
||||
if self.DEBUG:
|
||||
outassembled = os.path.join(self.files.k8dir, "assembled_text.dat")
|
||||
with open(pathof(outassembled), "wb") as f:
|
||||
f.write(assembled_text)
|
||||
|
||||
# The primary css style sheet is typically stored next followed by any
|
||||
# snippets of code that were previously inlined in the
|
||||
# original xhtml but have been stripped out and placed here.
|
||||
# This can include local CDATA snippets and svg sections.
|
||||
|
||||
# The problem is that for most browsers and ereaders, you can not
|
||||
# use <img src="imageXXXX.svg" /> to import any svg image that itself
|
||||
# properly uses an <image/> tag to import some raster image - it
|
||||
# should work according to the spec but does not for almost all browsers
|
||||
# and ereaders and causes epub validation issues because those raster
|
||||
# images are in the manifest but not in the xhtml text - since they are only
|
||||
# referenced from an svg image
|
||||
|
||||
# So we need to check the remaining flow pieces to see if they are css
|
||||
# or svg images. if svg images, we must check if they have an <image />
|
||||
# and if so inline them into the xhtml text pieces.
|
||||
|
||||
# there may be other sorts of pieces stored here but until we see one
|
||||
# in the wild to reverse engineer we won't be able to tell
|
||||
self.flowinfo.append([None, None, None, None])
|
||||
svg_tag_pattern = re.compile(br"""(<svg[^>]*>)""", re.IGNORECASE)
|
||||
image_tag_pattern = re.compile(br"""(<image[^>]*>)""", re.IGNORECASE)
|
||||
for j in range(1, len(self.flows)):
|
||||
flowpart = self.flows[j]
|
||||
nstr = "%04d" % j
|
||||
m = re.search(svg_tag_pattern, flowpart)
|
||||
if m is not None:
|
||||
# svg
|
||||
ptype = b"svg"
|
||||
start = m.start()
|
||||
m2 = re.search(image_tag_pattern, flowpart)
|
||||
if m2 is not None:
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
# strip off anything before <svg if inlining
|
||||
flowpart = flowpart[start:]
|
||||
else:
|
||||
pformat = b"file"
|
||||
pdir = "Images"
|
||||
fname = "svgimg" + nstr + ".svg"
|
||||
else:
|
||||
# search for CDATA and if exists inline it
|
||||
if flowpart.find(b"[CDATA[") >= 0:
|
||||
ptype = b"css"
|
||||
flowpart = b'<style type="text/css">\n' + flowpart + b"\n</style>\n"
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
else:
|
||||
# css - assume as standalone css file
|
||||
ptype = b"css"
|
||||
pformat = b"file"
|
||||
pdir = "Styles"
|
||||
fname = "style" + nstr + ".css"
|
||||
|
||||
self.flows[j] = flowpart
|
||||
self.flowinfo.append([ptype, pformat, pdir, fname])
|
||||
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFlow Map: %d entries" % len(self.flowinfo))
|
||||
for fi in self.flowinfo:
|
||||
logger.debug(fi)
|
||||
logger.debug("\n")
|
||||
|
||||
logger.debug(
|
||||
"\nXHTML File Part Position Information: %d entries"
|
||||
% len(self.partinfo)
|
||||
)
|
||||
for pi in self.partinfo:
|
||||
logger.debug(pi)
|
||||
|
||||
if False:  # self.DEBUG:
|
||||
# dump all of the locations of the aid tags used in TEXT
|
||||
# find id links only inside of tags
|
||||
# inside any < > pair find all "aid=' and return whatever is inside the quotes
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
logger.debug("\npositions of all aid= pieces")
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>""", re.IGNORECASE
|
||||
)
|
||||
for m in re.finditer(id_pattern, rawML):
|
||||
[filename, partnum, start, end] = self.getFileInfo(m.start())
|
||||
[seqnum, idtext] = self.getFragTblInfo(m.start())
|
||||
value = fromBase32(m.group(1))
|
||||
logger.debug(
|
||||
" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d"
|
||||
% (m.group(1), value, m.start(), partnum, start, end)
|
||||
)
|
||||
logger.debug(" %s fragtbl entry %d" % (idtext, seqnum))
|
||||
|
||||
return
|
||||
|
||||
# get information fragment table entry by pos
|
||||
def getFragTblInfo(self, pos):
|
||||
for j in range(len(self.fragtbl)):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
|
||||
if pos >= insertpos and pos < (insertpos + length):
|
||||
# why are these "in: and before: added here
|
||||
return seqnum, b"in: " + idtext
|
||||
if pos < insertpos:
|
||||
return seqnum, b"before: " + idtext
|
||||
return None, None
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getFileInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return filename, partnum, start, end
|
||||
return None, None, None, None
|
||||
|
||||
# accessor functions to properly protect the internal structure
|
||||
def getNumberOfParts(self):
|
||||
return len(self.parts)
|
||||
|
||||
def getPart(self, i):
|
||||
if i >= 0 and i < len(self.parts):
|
||||
return self.parts[i]
|
||||
return None
|
||||
|
||||
def getPartInfo(self, i):
|
||||
if i >= 0 and i < len(self.partinfo):
|
||||
return self.partinfo[i]
|
||||
return None
|
||||
|
||||
def getNumberOfFlows(self):
|
||||
return len(self.flows)
|
||||
|
||||
def getFlow(self, i):
|
||||
# note flows[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flows):
|
||||
return self.flows[i]
|
||||
return None
|
||||
|
||||
def getFlowInfo(self, i):
|
||||
# note flowinfo[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flowinfo):
|
||||
return self.flowinfo[i]
|
||||
return None
|
||||
|
||||
def getIDTagByPosFid(self, posfid, offset):
|
||||
# first convert kindle:pos:fid and offset info to position in file
|
||||
# (fromBase32 can handle both string types on input)
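# illustrative (hypothetical values): posfid=b"0003", offset=b"001A" give
# row 3 and off fromBase32("001A") == 1 * 32 + 10 == 42, so pos becomes
# fragtbl[3]'s insert position plus 42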
|
||||
row = fromBase32(posfid)
|
||||
off = fromBase32(offset)
|
||||
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
|
||||
pos = insertpos + off
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if fname is None:
|
||||
# pos does not exist
|
||||
# default to skeleton pos instead
|
||||
print(
|
||||
"Link To Position", pos, "does not exist, retargeting to top of target"
|
||||
)
|
||||
pos = self.skeltbl[filenum][3]
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
# an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
|
||||
# Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
|
||||
# some position information encoded into Base32 name.
|
||||
# so find the closest "id=" before position the file by actually searching in that file
|
||||
idtext = self.getIDTag(pos)
|
||||
return fname, idtext
|
||||
|
||||
def getIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos is inside a tag then search all text before its end of tag marker
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
aid_pattern = re.compile(br"""<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]""")
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
m = aid_pattern.match(tag)
|
||||
if m is not None:
|
||||
self.linked_aids.add(m.group(1))
|
||||
return b"aid-" + m.group(1)
|
||||
return b""
|
||||
|
||||
# do we need to do deep copying
|
||||
def setParts(self, parts):
|
||||
assert len(parts) == len(self.parts)
|
||||
for i in range(len(parts)):
|
||||
self.parts[i] = parts[i]
|
||||
|
||||
# do we need to do deep copying
|
||||
def setFlows(self, flows):
|
||||
assert len(flows) == len(self.flows)
|
||||
for i in range(len(flows)):
|
||||
self.flows[i] = flows[i]
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getSkelInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return [partnum, pdir, filename, start, end, aidtext]
|
||||
return [None, None, None, None, None, None]
|
||||
|
||||
# fileno is actually a reference into fragtbl (a fragment)
|
||||
def getGuideText(self):
|
||||
guidetext = b""
|
||||
for [ref_type, ref_title, fileno] in self.guidetbl:
|
||||
if ref_type == b"thumbimagestandard":
|
||||
continue
|
||||
if ref_type not in _guide_types and not ref_type.startswith(b"other."):
|
||||
if ref_type == b"start":
|
||||
ref_type = b"text"
|
||||
else:
|
||||
ref_type = b"other." + ref_type
|
||||
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
|
||||
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
|
||||
idtext = self.getIDTag(pos)
|
||||
linktgt = filename.encode("utf-8")
|
||||
if idtext != b"":
|
||||
linktgt += b"#" + idtext
|
||||
guidetext += (
|
||||
b'<reference type="'
|
||||
+ ref_type
|
||||
+ b'" title="'
|
||||
+ ref_title
|
||||
+ b'" href="'
|
||||
+ utf8_str(pdir)
|
||||
+ b"/"
|
||||
+ linktgt
|
||||
+ b'" />\n'
|
||||
)
|
||||
# opf is encoded utf-8 so must convert any titles properly
|
||||
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
|
||||
return guidetext
|
||||
|
||||
def getPageIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
# but page map offsets need a little more leeway so if the offset points
|
||||
# into a tag look for the next ending tag "/>" or "</" and start your search from there.
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos is inside a tag then search all text before the next ending tag
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
# we are in a tag
|
||||
# so find first ending tag
|
||||
pend1 = textblock.find(b"/>", npos)
|
||||
pend2 = textblock.find(b"</", npos)
|
||||
if pend1 != -1 and pend2 != -1:
|
||||
pend = min(pend1, pend2)
|
||||
else:
|
||||
pend = max(pend1, pend2)
|
||||
if pend != -1:
|
||||
npos = pend
|
||||
else:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
return b""
|
||||
290
mobiparse/mobi/mobi_k8resc.py
Executable file
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7.
|
||||
""" set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
|
||||
|
||||
if DEBUG_USE_ORDERED_DICTIONARY:
|
||||
from collections import OrderedDict as dict_
|
||||
else:
|
||||
dict_ = dict
|
||||
|
||||
from .compatibility_utils import unicode_str
|
||||
from loguru import logger
|
||||
|
||||
from .mobi_utils import fromBase32
|
||||
|
||||
_OPF_PARENT_TAGS = [
|
||||
"xml",
|
||||
"package",
|
||||
"metadata",
|
||||
"dc-metadata",
|
||||
"x-metadata",
|
||||
"manifest",
|
||||
"spine",
|
||||
"tours",
|
||||
"guide",
|
||||
]
|
||||
|
||||
|
||||
class K8RESCProcessor(object):
|
||||
def __init__(self, data, debug=False):
|
||||
self._debug = debug
|
||||
self.resc = None
|
||||
self.opos = 0
|
||||
self.extrameta = []
|
||||
self.cover_name = None
|
||||
self.spine_idrefs = {}
|
||||
self.spine_order = []
|
||||
self.spine_pageattributes = {}
|
||||
self.spine_ppd = None
|
||||
# need3 indicate the book has fields which require epub3.
|
||||
# but the estimation of the source epub version from the fields is difficult.
|
||||
self.need3 = False
|
||||
self.package_ver = None
|
||||
self.extra_metadata = []
|
||||
self.refines_metadata = []
|
||||
self.extra_attributes = []
|
||||
# get header
|
||||
start_pos = data.find(b"<")
|
||||
self.resc_header = data[:start_pos]
|
||||
# get resc data length
|
||||
start = self.resc_header.find(b"=") + 1
|
||||
end = self.resc_header.find(b"&", start)
|
||||
resc_size = 0
|
||||
if end > 0:
|
||||
resc_size = fromBase32(self.resc_header[start:end])
|
||||
resc_rawbytes = len(data) - start_pos
|
||||
if resc_rawbytes == resc_size:
|
||||
self.resc_length = resc_size
|
||||
else:
|
||||
# Most RESC sections have a nul string at the tail but some do not.
|
||||
end_pos = data.find(b"\x00", start_pos)
|
||||
if end_pos < 0:
|
||||
self.resc_length = resc_rawbytes
|
||||
else:
|
||||
self.resc_length = end_pos - start_pos
|
||||
if self.resc_length != resc_size:
|
||||
logger.debug(
|
||||
"Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(
|
||||
self.resc_length, resc_size
|
||||
)
|
||||
)
|
||||
# now parse RESC after converting it to unicode from utf-8
|
||||
self.resc = unicode_str(data[start_pos : start_pos + self.resc_length])
|
||||
self.parseData()
|
||||
|
||||
def prepend_to_spine(self, key, idref, linear, properties):
|
||||
self.spine_order = [key] + self.spine_order
|
||||
self.spine_idrefs[key] = idref
|
||||
attributes = {}
|
||||
if linear is not None:
|
||||
attributes["linear"] = linear
|
||||
if properties is not None:
|
||||
attributes["properties"] = properties
|
||||
self.spine_pageattributes[key] = attributes
|
||||
|
||||
# RESC tag iterator
|
||||
def resc_tag_iter(self):
|
||||
tcontent = last_tattr = None
|
||||
prefix = [""]
|
||||
while True:
|
||||
text, tag = self.parseresc()
|
||||
if text is None and tag is None:
|
||||
break
|
||||
if text is not None:
|
||||
tcontent = text.rstrip(" \r\n")
|
||||
else: # we have a tag
|
||||
ttype, tname, tattr = self.parsetag(tag)
|
||||
if ttype == "begin":
|
||||
tcontent = None
|
||||
prefix.append(tname + ".")
|
||||
if tname in _OPF_PARENT_TAGS:
|
||||
yield "".join(prefix), tname, tattr, tcontent
|
||||
else:
|
||||
last_tattr = tattr
|
||||
else: # single or end
|
||||
if ttype == "end":
|
||||
prefix.pop()
|
||||
tattr = last_tattr
|
||||
last_tattr = None
|
||||
if tname in _OPF_PARENT_TAGS:
|
||||
tname += "-end"
|
||||
yield "".join(prefix), tname, tattr, tcontent
|
||||
tcontent = None
|
||||
|
||||
# now parse the RESC to extract spine and extra metadata info
|
||||
def parseData(self):
|
||||
for prefix, tname, tattr, tcontent in self.resc_tag_iter():
|
||||
if self._debug:
|
||||
logger.debug(
|
||||
" Parsing RESC: %s %s %s %s" % (prefix, tname, tattr, tcontent)
|
||||
)
|
||||
if tname == "package":
|
||||
self.package_ver = tattr.get("version", "2.0")
|
||||
package_prefix = tattr.get("prefix", "")
|
||||
if self.package_ver.startswith("3") or package_prefix.startswith(
|
||||
"rendition"
|
||||
):
|
||||
self.need3 = True
|
||||
if tname == "spine":
|
||||
self.spine_ppd = tattr.get("page-progression-direction", None)
|
||||
if self.spine_ppd is not None and self.spine_ppd == "rtl":
|
||||
self.need3 = True
|
||||
if tname == "itemref":
|
||||
skelid = tattr.pop("skelid", None)
|
||||
if skelid is None and len(self.spine_order) == 0:
|
||||
# assume it was the removed initial coverpage
|
||||
skelid = "coverpage"
|
||||
tattr["linear"] = "no"
|
||||
self.spine_order.append(skelid)
|
||||
idref = tattr.pop("idref", None)
|
||||
if idref is not None:
|
||||
idref = "x_" + idref
|
||||
self.spine_idrefs[skelid] = idref
|
||||
if "id" in tattr:
|
||||
del tattr["id"]
|
||||
# tattr["id"] = 'x_' + tattr["id"]
|
||||
if "properties" in tattr:
|
||||
self.need3 = True
|
||||
self.spine_pageattributes[skelid] = tattr
|
||||
if tname == "meta" or tname.startswith("dc:"):
|
||||
if "refines" in tattr or "property" in tattr:
|
||||
self.need3 = True
|
||||
if tattr.get("name", "") == "cover":
|
||||
cover_name = tattr.get("content", None)
|
||||
if cover_name is not None:
|
||||
cover_name = "x_" + cover_name
|
||||
self.cover_name = cover_name
|
||||
else:
|
||||
self.extrameta.append([tname, tattr, tcontent])
|
||||
|
||||
# parse and return either leading text or the next tag
|
||||
def parseresc(self):
|
||||
p = self.opos
|
||||
if p >= len(self.resc):
|
||||
return None, None
|
||||
if self.resc[p] != "<":
|
||||
res = self.resc.find("<", p)
|
||||
if res == -1:
|
||||
res = len(self.resc)
|
||||
self.opos = res
|
||||
return self.resc[p:res], None
|
||||
# handle comment as a special case
|
||||
if self.resc[p : p + 4] == "<!--":
|
||||
te = self.resc.find("-->", p + 1)
|
||||
if te != -1:
|
||||
te = te + 2
|
||||
else:
|
||||
te = self.resc.find(">", p + 1)
|
||||
ntb = self.resc.find("<", p + 1)
|
||||
if ntb != -1 and ntb < te:
|
||||
self.opos = ntb
|
||||
return self.resc[p:ntb], None
|
||||
self.opos = te + 1
|
||||
return None, self.resc[p : te + 1]
|
||||
|
||||
# parses tag to identify: [ttype, tname, tattr]
|
||||
# tname: tag name
|
||||
# ttype: tag type ('begin', 'end' or 'single');
|
||||
# tattr: dictionary of tag attributes
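# illustrative (hypothetical tag): parsetag('<itemref idref="item1" linear="no"/>')
# returns ("single", "itemref", {"idref": "item1", "linear": "no"})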
|
||||
def parsetag(self, s):
|
||||
p = 1
|
||||
tname = None
|
||||
ttype = None
|
||||
tattr = dict_()
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] == "/":
|
||||
ttype = "end"
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
|
||||
p += 1
|
||||
tname = s[b:p].lower()
|
||||
# some special cases
|
||||
if tname == "?xml":
|
||||
tname = "xml"
|
||||
if tname == "!--":
|
||||
ttype = "single"
|
||||
comment = s[p:-3].strip()
|
||||
tattr["comment"] = comment
|
||||
if ttype is None:
|
||||
# parse any attributes of begin or single tags
|
||||
while s.find("=", p) != -1:
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] != "=":
|
||||
p += 1
|
||||
aname = s[b:p].lower()
|
||||
aname = aname.rstrip(" ")
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] in ('"', "'"):
|
||||
p = p + 1
|
||||
b = p
|
||||
while s[p : p + 1] not in ('"', "'"):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
p += 1
|
||||
else:
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " "):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
tattr[aname] = val
|
||||
if ttype is None:
|
||||
ttype = "begin"
|
||||
if s.find("/", p) >= 0:
|
||||
ttype = "single"
|
||||
return ttype, tname, tattr
|
||||
|
||||
def taginfo_toxml(self, taginfo):
|
||||
res = []
|
||||
tname, tattr, tcontent = taginfo
|
||||
res.append("<" + tname)
|
||||
if tattr is not None:
|
||||
for key in tattr:
|
||||
res.append(" " + key + '="' + tattr[key] + '"')
|
||||
if tcontent is not None:
|
||||
res.append(">" + tcontent + "</" + tname + ">\n")
|
||||
else:
|
||||
res.append("/>\n")
|
||||
return "".join(res)
|
||||
|
||||
def hasSpine(self):
|
||||
return len(self.spine_order) > 0
|
||||
|
||||
def needEPUB3(self):
|
||||
return self.need3
|
||||
|
||||
def hasRefines(self):
|
||||
for [tname, tattr, tcontent] in self.extrameta:
|
||||
if "refines" in tattr:
|
||||
return True
|
||||
return False
|
||||
|
||||
def createMetadata(self, epubver):
|
||||
for taginfo in self.extrameta:
|
||||
tname, tattr, tcontent = taginfo
|
||||
if "refines" in tattr:
|
||||
if epubver == "F" and "property" in tattr:
|
||||
attr = ' id="%s" opf:%s="%s"\n' % (
|
||||
tattr["refines"],
|
||||
tattr["property"],
|
||||
tcontent,
|
||||
)
|
||||
self.extra_attributes.append(attr)
|
||||
else:
|
||||
tag = self.taginfo_toxml(taginfo)
|
||||
self.refines_metadata.append(tag)
|
||||
else:
|
||||
tag = self.taginfo_toxml(taginfo)
|
||||
self.extra_metadata.append(tag)
|
||||
202
mobiparse/mobi/mobi_nav.py
Executable file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import unicode_str
|
||||
import os
|
||||
from .unipath import pathof
|
||||
from loguru import logger
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
DEBUG_NAV = False
|
||||
|
||||
FORCE_DEFAULT_TITLE = False
|
||||
""" Set to True to force to use the default title. """
|
||||
|
||||
NAVIGATION_FINENAME = "nav.xhtml"
|
||||
""" The name for the navigation document. """
|
||||
|
||||
DEFAULT_TITLE = "Navigation"
|
||||
""" The default title for the navigation document. """
|
||||
|
||||
|
||||
class NAVProcessor(object):
|
||||
def __init__(self, files):
|
||||
self.files = files
|
||||
self.navname = NAVIGATION_FINENAME
|
||||
|
||||
def buildLandmarks(self, guidetext):
|
||||
header = ""
|
||||
header += ' <nav epub:type="landmarks" id="landmarks" hidden="">\n'
|
||||
header += " <h2>Guide</h2>\n"
|
||||
header += " <ol>\n"
|
||||
element = ' <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
|
||||
footer = ""
|
||||
footer += " </ol>\n"
|
||||
footer += " </nav>\n"
|
||||
|
||||
type_map = {
|
||||
"cover": "cover",
|
||||
"title-page": "title-page",
|
||||
# ?: 'frontmatter',
|
||||
"text": "bodymatter",
|
||||
# ?: 'backmatter',
|
||||
"toc": "toc",
|
||||
"loi": "loi",
|
||||
"lot": "lot",
|
||||
"preface": "preface",
|
||||
"bibliography": "bibliography",
|
||||
"index": "index",
|
||||
"glossary": "glossary",
|
||||
"acknowledgements": "acknowledgements",
|
||||
"colophon": None,
|
||||
"copyright-page": None,
|
||||
"dedication": None,
|
||||
"epigraph": None,
|
||||
"foreword": None,
|
||||
"notes": None,
|
||||
}
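# guide types mapped to None above have no landmarks equivalent and are skipped when the <li> entries are built below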
|
||||
|
||||
re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
|
||||
re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
|
||||
re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
|
||||
dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace("\\", "/")
|
||||
|
||||
data = ""
|
||||
references = re.findall(r"<reference\s+.*?>", unicode_str(guidetext), re.I)
|
||||
for reference in references:
|
||||
mo_type = re_type.search(reference)
|
||||
mo_title = re_title.search(reference)
|
||||
mo_link = re_link.search(reference)
|
||||
if mo_type is not None:
|
||||
type_ = type_map.get(mo_type.group(1), None)
|
||||
else:
|
||||
type_ = None
|
||||
if mo_title is not None:
|
||||
title = mo_title.group(1)
|
||||
else:
|
||||
title = None
|
||||
if mo_link is not None:
|
||||
link = mo_link.group(1)
|
||||
else:
|
||||
link = None
|
||||
|
||||
if type_ is not None and title is not None and link is not None:
|
||||
link = os.path.relpath(link, dir_).replace("\\", "/")
|
||||
data += element.format(type_, link, title)
|
||||
if len(data) > 0:
|
||||
return header + data + footer
|
||||
else:
|
||||
return ""
|
||||
|
||||
def buildTOC(self, indx_data):
|
||||
header = ""
|
||||
header += ' <nav epub:type="toc" id="toc">\n'
|
||||
header += " <h1>Table of contents</h1>\n"
|
||||
footer = " </nav>\n"
|
||||
|
||||
# recursive part
|
||||
def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
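# walks indx_data[start:end], emitting an <li> for every entry at heading level lvl and recursing into child ranges; max_lvl tracks the deepest level seen and num counts the entries written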
|
||||
if start > len(indx_data) or end > len(indx_data):
|
||||
logger.debug(
|
||||
"Warning (in buildTOC): missing INDX child entries",
|
||||
start,
|
||||
end,
|
||||
len(indx_data),
|
||||
)
|
||||
return ""
|
||||
if DEBUG_NAV:
|
||||
logger.debug(
|
||||
"recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)
|
||||
)
|
||||
xhtml = ""
|
||||
if start <= 0:
|
||||
start = 0
|
||||
if end <= 0:
|
||||
end = len(indx_data)
|
||||
if lvl > max_lvl:
|
||||
max_lvl = lvl
|
||||
|
||||
indent1 = " " * (2 + lvl * 2)
|
||||
indent2 = " " * (3 + lvl * 2)
|
||||
xhtml += indent1 + "<ol>\n"
|
||||
for i in range(start, end):
|
||||
e = indx_data[i]
|
||||
htmlfile = e["filename"]
|
||||
desttag = e["idtag"]
|
||||
text = e["text"]
|
||||
if not e["hlvl"] == lvl:
|
||||
continue
|
||||
num += 1
|
||||
if desttag == "":
|
||||
link = htmlfile
|
||||
else:
|
||||
link = "{:s}#{:s}".format(htmlfile, desttag)
|
||||
xhtml += indent2 + "<li>"
|
||||
entry = '<a href="{:}">{:s}</a>'.format(link, text)
|
||||
xhtml += entry
|
||||
# recurs
|
||||
if e["child1"] >= 0:
|
||||
xhtml += "\n"
|
||||
xhtmlrec, max_lvl, num = recursINDX(
|
||||
max_lvl, num, lvl + 1, e["child1"], e["childn"] + 1
|
||||
)
|
||||
xhtml += xhtmlrec
|
||||
xhtml += indent2
|
||||
# close entry
|
||||
xhtml += "</li>\n"
|
||||
xhtml += indent1 + "</ol>\n"
|
||||
return xhtml, max_lvl, num
|
||||
|
||||
data, max_lvl, num = recursINDX()
|
||||
if not len(indx_data) == num:
|
||||
logger.debug(
|
||||
"Warning (in buildTOC): different number of entries in NCX",
|
||||
len(indx_data),
|
||||
num,
|
||||
)
|
||||
return header + data + footer
|
||||
|
||||
def buildNAV(self, ncx_data, guidetext, title, lang):
|
||||
logger.debug("Building Navigation Document.")
|
||||
if FORCE_DEFAULT_TITLE:
|
||||
title = DEFAULT_TITLE
|
||||
nav_header = ""
|
||||
nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
|
||||
nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
|
||||
nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
|
||||
nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
|
||||
nav_header += "<head>\n<title>{:s}</title>\n".format(title)
|
||||
nav_header += '<meta charset="UTF-8" />\n'
|
||||
nav_header += '<style type="text/css">\n'
|
||||
nav_header += "nav#landmarks { display:none; }\n"
|
||||
nav_header += "</style>\n</head>\n<body>\n"
|
||||
nav_footer = "</body>\n</html>\n"
|
||||
|
||||
landmarks = self.buildLandmarks(guidetext)
|
||||
toc = self.buildTOC(ncx_data)
|
||||
|
||||
data = nav_header
|
||||
data += landmarks
|
||||
data += toc
|
||||
data += nav_footer
|
||||
return data
|
||||
|
||||
def getNAVName(self):
|
||||
return self.navname
|
||||
|
||||
def writeNAV(self, ncx_data, guidetext, metadata):
|
||||
# build the xhtml
|
||||
# logger.debug("Write Navigation Document.")
|
||||
xhtml = self.buildNAV(
|
||||
ncx_data, guidetext, metadata.get("Title")[0], metadata.get("Language")[0]
|
||||
)
|
||||
fname = os.path.join(self.files.k8text, self.navname)
|
||||
with open(pathof(fname), "wb") as f:
|
||||
f.write(xhtml.encode("utf-8"))
|
||||
132
mobiparse/mobi/mobi_ncx.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
import os
|
||||
from .unipath import pathof
|
||||
from loguru import logger
|
||||
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
'''
|
||||
NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
|
||||
to Digital Talking Books, eBooks, and general web content models.
|
||||
This DTD is an XML application that layers navigation functionality on top of SMIL 2.0 content.
|
||||
The NCX defines a navigation path/model that may be applied upon existing publications,
|
||||
without modification of the existing publication source, so long as the navigation targets within
|
||||
the source publication can be directly referenced via a URI.
|
||||
|
||||
http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
|
||||
'''
|
||||
|
||||
from .mobi_utils import toBase32
|
||||
from .mobi_index import MobiIndex
|
||||
|
||||
DEBUG_NCX = False
|
||||
|
||||
class ncxExtract:
|
||||
def __init__(self, mh):
|
||||
self.mh = mh
|
||||
self.sect = self.mh.sect
|
||||
self.isNCX = False
|
||||
self.mi = MobiIndex(self.sect)
|
||||
self.ncxidx = self.mh.ncxidx
|
||||
self.indx_data = None
|
||||
|
||||
def parseNCX(self):
|
||||
indx_data = []
|
||||
tag_fieldname_map = {
|
||||
1: ["pos", 0],
|
||||
2: ["len", 0],
|
||||
3: ["noffs", 0],
|
||||
4: ["hlvl", 0],
|
||||
5: ["koffs", 0],
|
||||
6: ["pos_fid", 0],
|
||||
21: ["parent", 0],
|
||||
22: ["child1", 0],
|
||||
23: ["childn", 0],
|
||||
}
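# maps NCX INDX tag ids to (field name, index into the tag's value list); any tag present in tagMap below overwrites the default in tmp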
|
||||
if self.ncxidx != 0xFFFFFFFF:
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
|
||||
if DEBUG_NCX:
|
||||
logger.debug("ctoc_text {}".format(ctoc_text))
|
||||
logger.debug("outtbl {}".format(outtbl))
|
||||
num = 0
|
||||
for [text, tagMap] in outtbl:
|
||||
tmp = {
|
||||
"name": text.decode("utf-8"),
|
||||
"pos": -1,
|
||||
"len": 0,
|
||||
"noffs": -1,
|
||||
"text": "Unknown Text",
|
||||
"hlvl": -1,
|
||||
"kind": "Unknown Kind",
|
||||
"pos_fid": None,
|
||||
"parent": -1,
|
||||
"child1": -1,
|
||||
"childn": -1,
|
||||
"num": num,
|
||||
}
|
||||
for tag in tag_fieldname_map:
|
||||
[fieldname, i] = tag_fieldname_map[tag]
|
||||
if tag in tagMap:
|
||||
fieldvalue = tagMap[tag][i]
|
||||
if tag == 6:
|
||||
pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
|
||||
fieldvalue2 = tagMap[tag][i + 1]
|
||||
pos_off = toBase32(fieldvalue2, 10).decode("utf-8")
|
||||
fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
|
||||
tmp[fieldname] = fieldvalue
|
||||
if tag == 3:
|
||||
toctext = ctoc_text.get(fieldvalue, "Unknown Text")
|
||||
toctext = toctext.decode(self.mh.codec)
|
||||
tmp["text"] = toctext
|
||||
if tag == 5:
|
||||
kindtext = ctoc_text.get(fieldvalue, "Unknown Kind")
|
||||
kindtext = kindtext.decode(self.mh.codec)
|
||||
tmp["kind"] = kindtext
|
||||
indx_data.append(tmp)
|
||||
|
||||
# CGDBG
|
||||
'''
|
||||
record number: 3
|
||||
name: 03
|
||||
position 461377 length: 465358 => position/150 = real page number
|
||||
text: 第二章 青铜时代——单机游戏
|
||||
kind: Unknown Kind
|
||||
heading level: 0 => level of section
|
||||
parent: -1 => record number of previous level of section
|
||||
first child: 15 last child: 26 => range of record number of next level section
|
||||
pos_fid is kindle:pos:fid:0023:off:0000000000
|
||||
'''
|
||||
if DEBUG_NCX:
|
||||
print("record number: ", num)
|
||||
print(
|
||||
"name: ", tmp["name"],
|
||||
)
|
||||
print("position", tmp["pos"], " length: ", tmp["len"])
|
||||
print("text: ", tmp["text"])
|
||||
print("kind: ", tmp["kind"])
|
||||
print("heading level: ", tmp["hlvl"])
|
||||
print("parent:", tmp["parent"])
|
||||
print(
|
||||
"first child: ", tmp["child1"], " last child: ", tmp["childn"]
|
||||
)
|
||||
print("pos_fid is ", tmp["pos_fid"])
|
||||
print("\n\n")
|
||||
num += 1
|
||||
self.indx_data = indx_data
|
||||
|
||||
# {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0, 'text': '版权信息', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 0}
|
||||
# {'name': '0B', 'pos': 67932, 'len': 3274, 'noffs': 236, 'text': '8.希罗多德', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 11}
|
||||
#print('indx_data {}'.format(json.dumps(indx_data, indent=4, sort_keys=True, ensure_ascii=False)))
|
||||
|
||||
return indx_data
|
||||
|
||||
828
mobiparse/mobi/mobi_opf.py
Executable file
@@ -0,0 +1,828 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import unicode_str, unescapeit
|
||||
from .compatibility_utils import lzip
|
||||
from loguru import logger
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
from xml.sax.saxutils import escape as xmlescape
|
||||
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# In EPUB3, NCX and <guide> MAY exist in OPF, although the NCX is superseded
|
||||
# by the Navigation Document and the <guide> is deprecated. Currently, EPUB3_WITH_NCX
|
||||
# and EPUB3_WITH_GUIDE are set to True due to compatibility with epub2 reading systems.
|
||||
# They might be changed to False in the future.
|
||||
|
||||
EPUB3_WITH_NCX = True # Do not set to False except for debug.
|
||||
""" Set to True to create a toc.ncx when converting to epub3. """
|
||||
|
||||
EPUB3_WITH_GUIDE = True # Do not set to False except for debug.
|
||||
""" Set to True to create a guide element in an opf when converting to epub3. """
|
||||
|
||||
EPUB_OPF = "content.opf"
|
||||
""" The name for the OPF of EPUB. """
|
||||
|
||||
TOC_NCX = "toc.ncx"
|
||||
""" The name for the TOC of EPUB2. """
|
||||
|
||||
NAVIGATION_DOCUMENT = "nav.xhtml"
|
||||
""" The name for the navigation document of EPUB3. """
|
||||
|
||||
BEGIN_INFO_ONLY = "<!-- BEGIN INFORMATION ONLY "
|
||||
""" The comment to indicate the beginning of metadata which will be ignored by kindlegen. """
|
||||
|
||||
END_INFO_ONLY = "END INFORMATION ONLY -->"
|
||||
""" The comment to indicate the end of metadata which will be ignored by kindlegen. """
|
||||
|
||||
EXTH_TITLE_FURIGANA = "Title-Pronunciation"
|
||||
""" The name for Title Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTH_CREATOR_FURIGANA = "Author-Pronunciation"
|
||||
""" The name for Creator Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTH_PUBLISHER_FURIGANA = "Publisher-Pronunciation"
|
||||
""" The name for Publisher Furigana(similar to file-as) set by KDP. """
|
||||
|
||||
EXTRA_ENTITIES = {'"': """, "'": "'"}
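# extra entities handed to escapeit()/xmlescape() so quote characters are also escaped when a value is written into an XML attribute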
|
||||
|
||||
|
||||
class OPFProcessor(object):
|
||||
def __init__(
|
||||
self,
|
||||
files,
|
||||
metadata,
|
||||
fileinfo,
|
||||
rscnames,
|
||||
hasNCX,
|
||||
mh,
|
||||
usedmap,
|
||||
pagemapxml="",
|
||||
guidetext="",
|
||||
k8resc=None,
|
||||
epubver="2",
|
||||
):
|
||||
self.files = files
|
||||
self.metadata = metadata
|
||||
self.fileinfo = fileinfo
|
||||
self.rscnames = rscnames
|
||||
self.has_ncx = hasNCX
|
||||
self.codec = mh.codec
|
||||
self.isK8 = mh.isK8()
|
||||
self.printReplica = mh.isPrintReplica()
|
||||
self.guidetext = unicode_str(guidetext)
|
||||
self.used = usedmap
|
||||
self.k8resc = k8resc
|
||||
self.covername = None
|
||||
self.cover_id = "cover_img"
|
||||
if self.k8resc is not None and self.k8resc.cover_name is not None:
|
||||
# update cover id info from RESC if available
|
||||
self.cover_id = self.k8resc.cover_name
|
||||
# Create a unique urn uuid
|
||||
self.BookId = unicode_str(str(uuid.uuid4()))
|
||||
self.pagemap = pagemapxml
|
||||
|
||||
self.ncxname = None
|
||||
self.navname = None
|
||||
|
||||
# page-progression-direction is only set in spine
|
||||
self.page_progression_direction = metadata.pop(
|
||||
"page-progression-direction", [None]
|
||||
)[0]
|
||||
if "rl" in metadata.get("primary-writing-mode", [""])[0]:
|
||||
self.page_progression_direction = "rtl"
|
||||
self.epubver = epubver # the epub version set by user
|
||||
self.target_epubver = (
|
||||
epubver  # the epub version set by the user or detected automatically
|
||||
)
|
||||
if self.epubver == "A":
|
||||
self.target_epubver = self.autodetectEPUBVersion()
|
||||
elif self.epubver == "F":
|
||||
self.target_epubver = "2"
|
||||
elif self.epubver != "2" and self.epubver != "3":
|
||||
self.target_epubver = "2"
|
||||
|
||||
# id for rifine attributes
|
||||
self.title_id = {}
|
||||
self.creator_id = {}
|
||||
self.publisher_id = {}
|
||||
# extra attributes
|
||||
self.title_attrib = {}
|
||||
self.creator_attrib = {}
|
||||
self.publisher_attrib = {}
|
||||
self.extra_attributes = [] # for force epub2 option
|
||||
# Create epub3 metadata from EXTH.
|
||||
self.exth_solved_refines_metadata = []
|
||||
self.exth_refines_metadata = []
|
||||
self.exth_fixedlayout_metadata = []
|
||||
|
||||
self.defineRefinesID()
|
||||
self.processRefinesMetadata()
|
||||
if self.k8resc is not None:
|
||||
# Create metadata in RESC section.
|
||||
self.k8resc.createMetadata(epubver)
|
||||
if self.target_epubver == "3":
|
||||
self.createMetadataForFixedlayout()
|
||||
|
||||
def escapeit(self, sval, EXTRAS=None):
|
||||
# note, xmlescape and unescape do not work with utf-8 bytestrings
|
||||
sval = unicode_str(sval)
|
||||
if EXTRAS:
|
||||
res = xmlescape(unescapeit(sval), EXTRAS)
|
||||
else:
|
||||
res = xmlescape(unescapeit(sval))
|
||||
return res
|
||||
|
||||
def createMetaTag(self, data, property, content, refid=""):
|
||||
refines = ""
|
||||
if refid:
|
||||
refines = ' refines="#%s"' % refid
|
||||
data.append('<meta property="%s"%s>%s</meta>\n' % (property, refines, content))
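# e.g. createMetaTag(data, "rendition:layout", "pre-paginated") appends '<meta property="rendition:layout">pre-paginated</meta>\n'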
|
||||
|
||||
def buildOPFMetadata(self, start_tag, has_obfuscated_fonts=False):
|
||||
# convert from EXTH metadata format to target epub version metadata
|
||||
# epub 3 will ignore <meta name="xxxx" content="yyyy" /> style metatags
|
||||
# but allows them to be present for backwards compatibility
|
||||
# instead the new format is
|
||||
# <meta property="xxxx" id="iiii" ... > property_value</meta>
|
||||
# and DCMES elements such as:
|
||||
# <dc:blah id="iiii">value</dc:blah>
|
||||
|
||||
metadata = self.metadata
|
||||
k8resc = self.k8resc
|
||||
|
||||
META_TAGS = [
|
||||
"Drm Server Id",
|
||||
"Drm Commerce Id",
|
||||
"Drm Ebookbase Book Id",
|
||||
"ASIN",
|
||||
"ThumbOffset",
|
||||
"Fake Cover",
|
||||
"Creator Software",
|
||||
"Creator Major Version",
|
||||
"Creator Minor Version",
|
||||
"Creator Build Number",
|
||||
"Watermark",
|
||||
"Clipping Limit",
|
||||
"Publisher Limit",
|
||||
"Text to Speech Disabled",
|
||||
"CDE Type",
|
||||
"Updated Title",
|
||||
"Font Signature (hex)",
|
||||
"Tamper Proof Keys (hex)",
|
||||
]
|
||||
|
||||
# def handleTag(data, metadata, key, tag, ids={}):
|
||||
def handleTag(data, metadata, key, tag, attrib={}):
|
||||
"""Format metadata values.
|
||||
|
||||
@param data: List of formatted metadata entries.
|
||||
@param metadata: The metadata dictionary.
|
||||
@param key: The key of the metadata value to handle.
|
||||
@param tag: The opf tag corresponds to the metadata value.
|
||||
###@param ids: The ids in tags for refines property of epub3.
|
||||
@param attrib: The extra attributes for refines or opf prefixes.
|
||||
"""
|
||||
if key in metadata:
|
||||
for i, value in enumerate(metadata[key]):
|
||||
closingTag = tag.split(" ")[0]
|
||||
res = "<%s%s>%s</%s>\n" % (
|
||||
tag,
|
||||
attrib.get(i, ""),
|
||||
self.escapeit(value),
|
||||
closingTag,
|
||||
)
|
||||
data.append(res)
|
||||
del metadata[key]
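# e.g. handleTag(data, metadata, "Language", "dc:language") appends '<dc:language>en</dc:language>\n' for a stored value of "en"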
|
||||
|
||||
# these are allowed but ignored by epub3
|
||||
def handleMetaPairs(data, metadata, key, name):
|
||||
if key in metadata:
|
||||
for value in metadata[key]:
|
||||
res = '<meta name="%s" content="%s" />\n' % (
|
||||
name,
|
||||
self.escapeit(value, EXTRA_ENTITIES),
|
||||
)
|
||||
data.append(res)
|
||||
del metadata[key]
|
||||
|
||||
data = []
|
||||
data.append(start_tag + "\n")
|
||||
# Handle standard metadata
|
||||
if "Title" in metadata:
|
||||
handleTag(data, metadata, "Title", "dc:title", self.title_attrib)
|
||||
else:
|
||||
data.append("<dc:title>Untitled</dc:title>\n")
|
||||
handleTag(data, metadata, "Language", "dc:language")
|
||||
if "UniqueID" in metadata:
|
||||
handleTag(data, metadata, "UniqueID", 'dc:identifier id="uid"')
|
||||
else:
|
||||
# No unique ID in original, give it a generic one.
|
||||
data.append('<dc:identifier id="uid">0</dc:identifier>\n')
|
||||
|
||||
if self.target_epubver == "3":
|
||||
# epub version 3 minimal metadata requires a dcterms:modified date tag
|
||||
self.createMetaTag(
|
||||
data,
|
||||
"dcterms:modified",
|
||||
datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
)
|
||||
|
||||
if self.isK8 and has_obfuscated_fonts:
|
||||
# Use the randomly generated urn:uuid so obfuscated fonts work.
|
||||
# It doesn't need to be _THE_ unique identifier to work as a key
|
||||
# for obfuscated fonts in Sigil, ADE and calibre. It just has
|
||||
# to use the opf:scheme="UUID" and have the urn:uuid: prefix.
|
||||
if self.target_epubver == "3":
|
||||
data.append(
|
||||
"<dc:identifier>urn:uuid:" + self.BookId + "</dc:identifier>\n"
|
||||
)
|
||||
else:
|
||||
data.append(
|
||||
'<dc:identifier opf:scheme="UUID">urn:uuid:'
|
||||
+ self.BookId
|
||||
+ "</dc:identifier>\n"
|
||||
)
|
||||
|
||||
handleTag(data, metadata, "Creator", "dc:creator", self.creator_attrib)
|
||||
handleTag(data, metadata, "Contributor", "dc:contributor")
|
||||
handleTag(data, metadata, "Publisher", "dc:publisher", self.publisher_attrib)
|
||||
handleTag(data, metadata, "Source", "dc:source")
|
||||
handleTag(data, metadata, "Type", "dc:type")
|
||||
if self.target_epubver == "3":
|
||||
if "ISBN" in metadata:
|
||||
for i, value in enumerate(metadata["ISBN"]):
|
||||
res = (
|
||||
"<dc:identifier>urn:isbn:%s</dc:identifier>\n"
|
||||
% self.escapeit(value)
|
||||
)
|
||||
data.append(res)
|
||||
else:
|
||||
handleTag(data, metadata, "ISBN", 'dc:identifier opf:scheme="ISBN"')
|
||||
if "Subject" in metadata:
|
||||
if "SubjectCode" in metadata:
|
||||
codeList = metadata["SubjectCode"]
|
||||
del metadata["SubjectCode"]
|
||||
else:
|
||||
codeList = None
|
||||
for i in range(len(metadata["Subject"])):
|
||||
if codeList and i < len(codeList):
|
||||
data.append('<dc:subject BASICCode="' + codeList[i] + '">')
|
||||
else:
|
||||
data.append("<dc:subject>")
|
||||
data.append(self.escapeit(metadata["Subject"][i]) + "</dc:subject>\n")
|
||||
del metadata["Subject"]
|
||||
handleTag(data, metadata, "Description", "dc:description")
|
||||
if self.target_epubver == "3":
|
||||
if "Published" in metadata:
|
||||
for i, value in enumerate(metadata["Published"]):
|
||||
res = "<dc:date>%s</dc:date>\n" % self.escapeit(value)
|
||||
data.append(res)
|
||||
else:
|
||||
handleTag(data, metadata, "Published", 'dc:date opf:event="publication"')
|
||||
handleTag(data, metadata, "Rights", "dc:rights")
|
||||
|
||||
if self.epubver == "F":
|
||||
if self.extra_attributes or k8resc is not None and k8resc.extra_attributes:
|
||||
data.append(
|
||||
"<!-- THE FOLLOWINGS ARE REQUIRED TO INSERT INTO <dc:xxx> MANUALLY\n"
|
||||
)
|
||||
if self.extra_attributes:
|
||||
data += self.extra_attributes
|
||||
if k8resc is not None and k8resc.extra_attributes:
|
||||
data += k8resc.extra_attributes
|
||||
data.append("-->\n")
|
||||
else:
|
||||
# Append refines metadata.
|
||||
if self.exth_solved_refines_metadata:
|
||||
data.append("<!-- Refines MetaData from EXTH -->\n")
|
||||
data += self.exth_solved_refines_metadata
|
||||
if (
|
||||
self.exth_refines_metadata
|
||||
or k8resc is not None
|
||||
and k8resc.refines_metadata
|
||||
):
|
||||
data.append("<!-- THE FOLLOWINGS ARE REQUIRED TO EDIT IDS MANUALLY\n")
|
||||
if self.exth_refines_metadata:
|
||||
data += self.exth_refines_metadata
|
||||
if k8resc is not None and k8resc.refines_metadata:
|
||||
data += k8resc.refines_metadata
|
||||
data.append("-->\n")
|
||||
|
||||
# Append metadata in RESC section.
|
||||
if k8resc is not None and k8resc.extra_metadata:
|
||||
data.append("<!-- Extra MetaData from RESC\n")
|
||||
data += k8resc.extra_metadata
|
||||
data.append("-->\n")
|
||||
|
||||
if "CoverOffset" in metadata:
|
||||
imageNumber = int(metadata["CoverOffset"][0])
|
||||
self.covername = self.rscnames[imageNumber]
|
||||
if self.covername is None:
|
||||
logger.debug(
|
||||
"Error: Cover image %s was not recognized as a valid image"
|
||||
% imageNumber
|
||||
)
|
||||
else:
|
||||
# <meta name="cover"> is obsoleted in EPUB3, but kindlegen v2.9 requires it.
|
||||
data.append('<meta name="cover" content="' + self.cover_id + '" />\n')
|
||||
self.used[self.covername] = "used"
|
||||
del metadata["CoverOffset"]
|
||||
|
||||
handleMetaPairs(data, metadata, "Codec", "output encoding")
|
||||
# handle kindlegen specific tags
|
||||
handleTag(data, metadata, "DictInLanguage", "DictionaryInLanguage")
|
||||
handleTag(data, metadata, "DictOutLanguage", "DictionaryOutLanguage")
|
||||
handleMetaPairs(data, metadata, "RegionMagnification", "RegionMagnification")
|
||||
handleMetaPairs(data, metadata, "book-type", "book-type")
|
||||
handleMetaPairs(data, metadata, "zero-gutter", "zero-gutter")
|
||||
handleMetaPairs(data, metadata, "zero-margin", "zero-margin")
|
||||
handleMetaPairs(data, metadata, "primary-writing-mode", "primary-writing-mode")
|
||||
handleMetaPairs(data, metadata, "fixed-layout", "fixed-layout")
|
||||
handleMetaPairs(data, metadata, "orientation-lock", "orientation-lock")
|
||||
handleMetaPairs(data, metadata, "original-resolution", "original-resolution")
|
||||
|
||||
# these are not allowed in epub2 or 3 so convert them to meta name content pairs
|
||||
# perhaps these could better be mapped into the dcterms namespace instead
|
||||
handleMetaPairs(data, metadata, "Review", "review")
|
||||
handleMetaPairs(data, metadata, "Imprint", "imprint")
|
||||
handleMetaPairs(data, metadata, "Adult", "adult")
|
||||
handleMetaPairs(data, metadata, "DictShortName", "DictionaryVeryShortName")
|
||||
|
||||
# these are needed by Kobo books upon submission, but it is unclear whether they are legal metadata in epub2 or epub3
|
||||
if "Price" in metadata and "Currency" in metadata:
|
||||
priceList = metadata["Price"]
|
||||
currencyList = metadata["Currency"]
|
||||
if len(priceList) != len(currencyList):
|
||||
logger.debug("Error: found %s price entries, but %s currency entries.")
|
||||
else:
|
||||
for i in range(len(priceList)):
|
||||
data.append(
|
||||
'<SRP Currency="'
|
||||
+ currencyList[i]
|
||||
+ '">'
|
||||
+ priceList[i]
|
||||
+ "</SRP>\n"
|
||||
)
|
||||
del metadata["Price"]
|
||||
del metadata["Currency"]
|
||||
|
||||
if self.target_epubver == "3":
|
||||
# Append metadata for EPUB3.
|
||||
if self.exth_fixedlayout_metadata:
|
||||
data.append("<!-- EPUB3 MedaData converted from EXTH -->\n")
|
||||
data += self.exth_fixedlayout_metadata
|
||||
|
||||
# all that remains is extra EXTH info we will store inside a comment inside meta name/content pairs
|
||||
# so it can not impact anything and will be automatically stripped out if found again in a RESC section
|
||||
data.append(BEGIN_INFO_ONLY + "\n")
|
||||
if "ThumbOffset" in metadata:
|
||||
imageNumber = int(metadata["ThumbOffset"][0])
|
||||
imageName = self.rscnames[imageNumber]
|
||||
if imageName is None:
|
||||
logger.debug(
|
||||
"Error: Cover Thumbnail image %s was not recognized as a valid image"
|
||||
% imageNumber
|
||||
)
|
||||
else:
|
||||
data.append(
|
||||
'<meta name="Cover ThumbNail Image" content="'
|
||||
+ "Images/"
|
||||
+ imageName
|
||||
+ '" />\n'
|
||||
)
|
||||
# self.used[imageName] = 'used' # thumbnail image is always generated by Kindlegen, so don't include in manifest
|
||||
self.used[imageName] = "not used"
|
||||
del metadata["ThumbOffset"]
|
||||
for metaName in META_TAGS:
|
||||
if metaName in metadata:
|
||||
for value in metadata[metaName]:
|
||||
data.append(
|
||||
'<meta name="'
|
||||
+ metaName
|
||||
+ '" content="'
|
||||
+ self.escapeit(value, EXTRA_ENTITIES)
|
||||
+ '" />\n'
|
||||
)
|
||||
del metadata[metaName]
|
||||
for key in list(metadata.keys()):
|
||||
for value in metadata[key]:
|
||||
data.append(
|
||||
'<meta name="'
|
||||
+ key
|
||||
+ '" content="'
|
||||
+ self.escapeit(value, EXTRA_ENTITIES)
|
||||
+ '" />\n'
|
||||
)
|
||||
del metadata[key]
|
||||
data.append(END_INFO_ONLY + "\n")
|
||||
data.append("</metadata>\n")
|
||||
return data
|
||||
|
||||
def buildOPFManifest(self, ncxname, navname=None):
|
||||
# buildManifest for mobi7, azw4, epub2 and epub3.
|
||||
k8resc = self.k8resc
|
||||
cover_id = self.cover_id
|
||||
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||
self.ncxname = ncxname
|
||||
self.navname = navname
|
||||
|
||||
data = []
|
||||
data.append("<manifest>\n")
|
||||
media_map = {
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".svg": "image/svg+xml",
|
||||
".xhtml": "application/xhtml+xml",
|
||||
".html": "text/html", # for mobi7
|
||||
".pdf": "application/pdf", # for azw4(print replica textbook)
|
||||
".ttf": "application/x-font-ttf",
|
||||
".otf": "application/x-font-opentype", # replaced?
|
||||
".css": "text/css",
|
||||
# '.html' : 'text/x-oeb1-document', # for mobi7
|
||||
# '.otf' : 'application/vnd.ms-opentype', # [OpenType] OpenType fonts
|
||||
# '.woff' : 'application/font-woff', # [WOFF] WOFF fonts
|
||||
# '.smil' : 'application/smil+xml', # [MediaOverlays301] EPUB Media Overlay documents
|
||||
# '.pls' : 'application/pls+xml', # [PLS] Text-to-Speech (TTS) Pronunciation lexicons
|
||||
# '.mp3' : 'audio/mpeg',
|
||||
# '.mp4' : 'video/mp4',
|
||||
# '.js' : 'text/javascript', # not supported in K8
|
||||
}
|
||||
spinerefs = []
|
||||
|
||||
idcnt = 0
|
||||
for [key, dir, fname] in self.fileinfo:
|
||||
name, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
media = media_map.get(ext)
|
||||
ref = "item%d" % idcnt
|
||||
if hasK8RescSpine:
|
||||
if key is not None and key in k8resc.spine_idrefs:
|
||||
ref = k8resc.spine_idrefs[key]
|
||||
properties = ""
|
||||
if dir != "":
|
||||
fpath = dir + "/" + fname
|
||||
else:
|
||||
fpath = fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
|
||||
if ext in [".xhtml", ".html"]:
|
||||
spinerefs.append(ref)
|
||||
idcnt += 1
|
||||
|
||||
for fname in self.rscnames:
|
||||
if fname is not None:
|
||||
if self.used.get(fname, "not used") == "not used":
|
||||
continue
|
||||
name, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
media = media_map.get(ext, ext[1:])
|
||||
properties = ""
|
||||
if fname == self.covername:
|
||||
ref = cover_id
|
||||
if self.target_epubver == "3":
|
||||
properties = 'properties="cover-image"'
|
||||
else:
|
||||
ref = "item%d" % idcnt
|
||||
if ext == ".ttf" or ext == ".otf":
|
||||
if self.isK8: # fonts are only used in Mobi 8
|
||||
fpath = "Fonts/" + fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
else:
|
||||
fpath = "Images/" + fname
|
||||
data.append(
|
||||
'<item id="{0:}" media-type="{1:}" href="{2:}" {3:}/>\n'.format(
|
||||
ref, media, fpath, properties
|
||||
)
|
||||
)
|
||||
idcnt += 1
|
||||
|
||||
if self.target_epubver == "3" and navname is not None:
|
||||
data.append(
|
||||
'<item id="nav" media-type="application/xhtml+xml" href="Text/'
|
||||
+ navname
|
||||
+ '" properties="nav"/>\n'
|
||||
)
|
||||
if self.has_ncx and ncxname is not None:
|
||||
data.append(
|
||||
'<item id="ncx" media-type="application/x-dtbncx+xml" href="'
|
||||
+ ncxname
|
||||
+ '" />\n'
|
||||
)
|
||||
if self.pagemap != "":
|
||||
data.append(
|
||||
'<item id="map" media-type="application/oebs-page-map+xml" href="page-map.xml" />\n'
|
||||
)
|
||||
data.append("</manifest>\n")
|
||||
return [data, spinerefs]
|
||||
|
||||
def buildOPFSpine(self, spinerefs, isNCX):
|
||||
# build spine
|
||||
k8resc = self.k8resc
|
||||
hasK8RescSpine = k8resc is not None and k8resc.hasSpine()
|
||||
data = []
|
||||
ppd = ""
|
||||
if self.isK8 and self.page_progression_direction is not None:
|
||||
ppd = ' page-progression-direction="{:s}"'.format(
|
||||
self.page_progression_direction
|
||||
)
|
||||
ncx = ""
|
||||
if isNCX:
|
||||
ncx = ' toc="ncx"'
|
||||
map = ""
|
||||
if self.pagemap != "":
|
||||
map = ' page-map="map"'
|
||||
if self.epubver == "F":
|
||||
if ppd:
|
||||
ppd = "<!--" + ppd + " -->"
|
||||
spine_start_tag = "<spine{1:s}{2:s}>{0:s}\n".format(ppd, map, ncx)
|
||||
else:
|
||||
spine_start_tag = "<spine{0:s}{1:s}{2:s}>\n".format(ppd, map, ncx)
|
||||
data.append(spine_start_tag)
|
||||
|
||||
if hasK8RescSpine:
|
||||
for key in k8resc.spine_order:
|
||||
idref = k8resc.spine_idrefs[key]
|
||||
attribs = k8resc.spine_pageattributes[key]
|
||||
tag = '<itemref idref="%s"' % idref
|
||||
for aname, val in list(attribs.items()):
|
||||
if self.epubver == "F" and aname == "properties":
|
||||
continue
|
||||
if val is not None:
|
||||
tag += ' %s="%s"' % (aname, val)
|
||||
tag += "/>"
|
||||
if self.epubver == "F" and "properties" in attribs:
|
||||
val = attribs["properties"]
|
||||
if val is not None:
|
||||
tag += '<!-- properties="%s" -->' % val
|
||||
tag += "\n"
|
||||
data.append(tag)
|
||||
else:
|
||||
start = 0
|
||||
# special case the created coverpage if need be
|
||||
[key, dir, fname] = self.fileinfo[0]
|
||||
if key is not None and key == "coverpage":
|
||||
entry = spinerefs[start]
|
||||
data.append('<itemref idref="%s" linear="no"/>\n' % entry)
|
||||
start += 1
|
||||
for entry in spinerefs[start:]:
|
||||
data.append('<itemref idref="' + entry + '"/>\n')
|
||||
data.append("</spine>\n")
|
||||
return data
|
||||
|
||||
def buildMobi7OPF(self):
|
||||
# Build an OPF for mobi7 and azw4.
|
||||
logger.debug("Building an opf for mobi7/azw4.")
|
||||
data = []
|
||||
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||
data.append(
|
||||
'<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||
)
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||
opf_metadata = self.buildOPFMetadata(metadata_tag)
|
||||
data += opf_metadata
|
||||
if self.has_ncx:
|
||||
# ncxname = self.files.getInputFileBasename() + '.ncx'
|
||||
ncxname = "toc.ncx"
|
||||
else:
|
||||
ncxname = None
|
||||
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname)
|
||||
data += opf_manifest
|
||||
opf_spine = self.buildOPFSpine(spinerefs, self.has_ncx)
|
||||
data += opf_spine
|
||||
data.append("<tours>\n</tours>\n")
|
||||
if not self.printReplica:
|
||||
guide = "<guide>\n" + self.guidetext + "</guide>\n"
|
||||
data.append(guide)
|
||||
data.append("</package>\n")
|
||||
return "".join(data)
|
||||
|
||||
def buildEPUBOPF(self, has_obfuscated_fonts=False):
|
||||
logger.debug(
|
||||
"Building an opf for mobi8 using epub version: %s" % self.target_epubver
|
||||
)
|
||||
if self.target_epubver == "2":
|
||||
has_ncx = self.has_ncx
|
||||
has_guide = True
|
||||
ncxname = TOC_NCX
|
||||
navname = None
|
||||
package = '<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="uid">\n'
|
||||
tours = "<tours>\n</tours>\n"
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">'
|
||||
else:
|
||||
has_ncx = EPUB3_WITH_NCX
|
||||
has_guide = EPUB3_WITH_GUIDE
|
||||
ncxname = None
|
||||
if has_ncx:
|
||||
ncxname = TOC_NCX
|
||||
navname = NAVIGATION_DOCUMENT
|
||||
package = '<package version="3.0" xmlns="http://www.idpf.org/2007/opf" prefix="rendition: http://www.idpf.org/vocab/rendition/#" unique-identifier="uid">\n'
|
||||
tours = ""
|
||||
metadata_tag = '<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">'
|
||||
|
||||
data = []
|
||||
data.append('<?xml version="1.0" encoding="utf-8"?>\n')
|
||||
data.append(package)
|
||||
opf_metadata = self.buildOPFMetadata(metadata_tag, has_obfuscated_fonts)
|
||||
data += opf_metadata
|
||||
[opf_manifest, spinerefs] = self.buildOPFManifest(ncxname, navname)
|
||||
data += opf_manifest
|
||||
opf_spine = self.buildOPFSpine(spinerefs, has_ncx)
|
||||
data += opf_spine
|
||||
data.append(tours)
|
||||
if has_guide:
|
||||
guide = "<guide>\n" + self.guidetext + "</guide>\n"
|
||||
data.append(guide)
|
||||
data.append("</package>\n")
|
||||
return "".join(data)
|
||||
|
||||
def writeOPF(self, has_obfuscated_fonts=False):
|
||||
if self.isK8:
|
||||
data = self.buildEPUBOPF(has_obfuscated_fonts)
|
||||
outopf = os.path.join(self.files.k8oebps, EPUB_OPF)
|
||||
with open(pathof(outopf), "wb") as f:
|
||||
f.write(data.encode("utf-8"))
|
||||
return self.BookId
|
||||
else:
|
||||
data = self.buildMobi7OPF()
|
||||
outopf = os.path.join(self.files.mobi7dir, "content.opf")
|
||||
with open(pathof(outopf), "wb") as f:
|
||||
f.write(data.encode("utf-8"))
|
||||
return 0
|
||||
|
||||
def getBookId(self):
|
||||
return self.BookId
|
||||
|
||||
def getNCXName(self):
|
||||
return self.ncxname
|
||||
|
||||
def getNAVName(self):
|
||||
return self.navname
|
||||
|
||||
def getEPUBVersion(self):
|
||||
return self.target_epubver
|
||||
|
||||
def hasNCX(self):
|
||||
return self.ncxname is not None and self.has_ncx
|
||||
|
||||
def hasNAV(self):
|
||||
return self.navname is not None
|
||||
|
||||
def autodetectEPUBVersion(self):
|
||||
# Determine EPUB version from metadata and RESC.
|
||||
metadata = self.metadata
|
||||
k8resc = self.k8resc
|
||||
epubver = "2"
|
||||
if "true" == metadata.get("fixed-layout", [""])[0].lower():
|
||||
epubver = "3"
|
||||
elif metadata.get("orientation-lock", [""])[0].lower() in [
|
||||
"portrait",
|
||||
"landscape",
|
||||
]:
|
||||
epubver = "3"
|
||||
elif self.page_progression_direction == "rtl":
|
||||
epubver = "3"
|
||||
elif EXTH_TITLE_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif EXTH_CREATOR_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif EXTH_PUBLISHER_FURIGANA in metadata:
|
||||
epubver = "3"
|
||||
elif k8resc is not None and k8resc.needEPUB3():
|
||||
epubver = "3"
|
||||
return epubver
|
||||
|
||||
def defineRefinesID(self):
|
||||
# the following EXTH are set by KDP.
|
||||
# 'Title_Furigana_(508)'
|
||||
# 'Creator_Furigana_(517)',
|
||||
# 'Publisher_Furigana_(522)'
|
||||
# It is difficult to find correspondence between Title, Creator, Publisher
|
||||
# and EXTH 508, 517, and 522 if they have more than two values, since KDP does not seem to preserve the order of EXTH 508, 517, and 522.
|
||||
# It is also difficult to find correspondence between them and tags which have refine attributes in RESC.
|
||||
# So editing manually is required.
|
||||
metadata = self.metadata
|
||||
|
||||
needRefinesId = False
|
||||
if self.k8resc is not None:
|
||||
needRefinesId = self.k8resc.hasRefines()
|
||||
# Create ids for refines attributes
|
||||
if (needRefinesId or EXTH_TITLE_FURIGANA in metadata) and "Title" in metadata:
|
||||
for i in range(len(metadata.get("Title"))):
|
||||
self.title_id[i] = "title%02d" % (i + 1)
|
||||
|
||||
if (
|
||||
needRefinesId or EXTH_CREATOR_FURIGANA in metadata
|
||||
) and "Creator" in metadata:
|
||||
for i in range(len(metadata.get("Creator"))):
|
||||
self.creator_id[i] = "creator%02d" % (i + 1)
|
||||
|
||||
if (
|
||||
needRefinesId or EXTH_PUBLISHER_FURIGANA in metadata
|
||||
) and "Publisher" in metadata:
|
||||
for i in range(len(metadata.get("Publisher"))):
|
||||
self.publisher_id[i] = "publisher%02d" % (i + 1)
|
||||
|
||||
def processRefinesMetadata(self):
|
||||
# create refines metadata defined in epub3 or convert the refines property to opf: attributes for epub2.
|
||||
metadata = self.metadata
|
||||
|
||||
refines_list = [
|
||||
[EXTH_TITLE_FURIGANA, self.title_id, self.title_attrib, "title00"],
|
||||
[EXTH_CREATOR_FURIGANA, self.creator_id, self.creator_attrib, "creator00"],
|
||||
[
|
||||
EXTH_PUBLISHER_FURIGANA,
|
||||
self.publisher_id,
|
||||
self.publisher_attrib,
|
||||
"publisher00",
|
||||
],
|
||||
]
|
||||
|
||||
create_refines_metadata = False
|
||||
for EXTH in lzip(*refines_list)[0]:
|
||||
if EXTH in metadata:
|
||||
create_refines_metadata = True
|
||||
break
|
||||
if create_refines_metadata:
|
||||
for [EXTH, id, attrib, defaultid] in refines_list:
|
||||
if self.target_epubver == "3":
|
||||
for i, value in list(id.items()):
|
||||
attrib[i] = ' id="%s"' % value
|
||||
|
||||
if EXTH in metadata:
|
||||
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||
self.createMetaTag(
|
||||
self.exth_solved_refines_metadata,
|
||||
"file-as",
|
||||
metadata[EXTH][0],
|
||||
id[0],
|
||||
)
|
||||
else:
|
||||
for i, value in enumerate(metadata[EXTH]):
|
||||
self.createMetaTag(
|
||||
self.exth_refines_metadata,
|
||||
"file-as",
|
||||
value,
|
||||
id.get(i, defaultid),
|
||||
)
|
||||
else:
|
||||
if EXTH in metadata:
|
||||
if len(metadata[EXTH]) == 1 and len(id) == 1:
|
||||
attr = ' opf:file-as="%s"' % metadata[EXTH][0]
|
||||
attrib[0] = attr
|
||||
else:
|
||||
for i, value in enumerate(metadata[EXTH]):
|
||||
attr = ' id="#%s" opf:file-as="%s"\n' % (
|
||||
id.get(i, defaultid),
|
||||
value,
|
||||
)
|
||||
self.extra_attributes.append(attr)
|
||||
|
||||
def createMetadataForFixedlayout(self):
|
||||
# convert fixed layout to epub3 format if needed.
|
||||
metadata = self.metadata
|
||||
|
||||
if "fixed-layout" in metadata:
|
||||
fixedlayout = metadata["fixed-layout"][0]
|
||||
content = {"true": "pre-paginated"}.get(fixedlayout.lower(), "reflowable")
|
||||
self.createMetaTag(
|
||||
self.exth_fixedlayout_metadata, "rendition:layout", content
|
||||
)
|
||||
|
||||
if "orientation-lock" in metadata:
|
||||
content = metadata["orientation-lock"][0].lower()
|
||||
if content == "portrait" or content == "landscape":
|
||||
self.createMetaTag(
|
||||
self.exth_fixedlayout_metadata, "rendition:orientation", content
|
||||
)
|
||||
|
||||
# according to epub3 spec about correspondence with Amazon
|
||||
# if 'original-resolution' is provided it needs to be converted to
|
||||
# meta viewport property tag stored in the <head></head> of **each**
|
||||
# xhtml page - so this tag would need to be handled by editing each part
|
||||
# before reaching this routine
|
||||
# we need to add support for this to the k8html routine
|
||||
# if 'original-resolution' in metadata.keys():
|
||||
# resolution = metadata['original-resolution'][0].lower()
|
||||
# width, height = resolution.split('x')
|
||||
# if width.isdigit() and int(width) > 0 and height.isdigit() and int(height) > 0:
|
||||
# viewport = 'width=%s, height=%s' % (width, height)
|
||||
# self.createMetaTag(self.exth_fixedlayout_metadata, 'rendition:viewport', viewport)
|
||||
185
mobiparse/mobi/mobi_pagemap.py
Executable file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, unicode_str
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
|
||||
_TABLE = [
|
||||
("m", 1000),
|
||||
("cm", 900),
|
||||
("d", 500),
|
||||
("cd", 400),
|
||||
("c", 100),
|
||||
("xc", 90),
|
||||
("l", 50),
|
||||
("xl", 40),
|
||||
("x", 10),
|
||||
("ix", 9),
|
||||
("v", 5),
|
||||
("iv", 4),
|
||||
("i", 1),
|
||||
]
|
||||
|
||||
|
||||
def int_to_roman(i):
|
||||
parts = []
|
||||
num = i
|
||||
for letter, value in _TABLE:
|
||||
while value <= num:
|
||||
num -= value
|
||||
parts.append(letter)
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
def roman_to_int(s):
|
||||
result = 0
|
||||
rnstr = s
|
||||
for letter, value in _TABLE:
|
||||
while rnstr.startswith(letter):
|
||||
result += value
|
||||
rnstr = rnstr[len(letter) :]
|
||||
return result
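# example: roman_to_int("xiv") returns 14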
|
||||
|
||||
|
||||
_pattern = r"""\(([^\)]*)\)"""
|
||||
_tup_pattern = re.compile(_pattern, re.IGNORECASE)
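# matches the parenthesised page-name tuples embedded in the page map string, e.g. "(1,a,1)" or "(5,r,i)"; _parseNames below splits each tuple into (position, type, value)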
|
||||
|
||||
|
||||
def _parseNames(numpages, data):
|
||||
data = unicode_str(data)
|
||||
pagenames = []
|
||||
pageMap = ""
|
||||
for i in range(numpages):
|
||||
pagenames.append(None)
|
||||
for m in re.finditer(_tup_pattern, data):
|
||||
tup = m.group(1)
|
||||
if pageMap != "":
|
||||
pageMap += ","
|
||||
pageMap += "(" + tup + ")"
|
||||
spos, nametype, svalue = tup.split(",")
|
||||
# print(spos, nametype, svalue)
|
||||
if nametype == "a" or nametype == "r":
|
||||
svalue = int(svalue)
|
||||
spos = int(spos)
|
||||
for i in range(spos - 1, numpages):
|
||||
if nametype == "r":
|
||||
pname = int_to_roman(svalue)
|
||||
svalue += 1
|
||||
elif nametype == "a":
|
||||
pname = "%s" % svalue
|
||||
svalue += 1
|
||||
elif nametype == "c":
|
||||
sp = svalue.find("|")
|
||||
if sp == -1:
|
||||
pname = svalue
|
||||
else:
|
||||
pname = svalue[0:sp]
|
||||
svalue = svalue[sp + 1 :]
|
||||
else:
|
||||
logger.debug("Error: unknown page numbering type %s" % nametype)
|
||||
pagenames[i] = pname
|
||||
return pagenames, pageMap
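# example: _parseNames(3, "(1,a,1)") returns (["1", "2", "3"], "(1,a,1)")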
|
||||
|
||||
|
||||
class PageMapProcessor:
|
||||
def __init__(self, mh, data):
|
||||
self.mh = mh
|
||||
self.data = data
|
||||
self.pagenames = []
|
||||
self.pageoffsets = []
|
||||
self.pageMap = ""
|
||||
self.pm_len = 0
|
||||
self.pm_nn = 0
|
||||
self.pn_bits = 0
|
||||
self.pmoff = None
|
||||
self.pmstr = ""
|
||||
logger.debug("Extracting Page Map Information")
|
||||
(rev_len,) = struct.unpack_from(b">L", self.data, 0x10)
|
||||
# skip over header, revision string length data, and revision string
|
||||
ptr = 0x14 + rev_len
|
||||
pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(
|
||||
b">4H", self.data, ptr
|
||||
)
|
||||
# print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
|
||||
self.pmstr = self.data[ptr + 8 : ptr + 8 + self.pm_len]
|
||||
self.pmoff = self.data[ptr + 8 + self.pm_len :]
|
||||
offsize = b">L"
|
||||
offwidth = 4
|
||||
if self.pm_bits == 16:
|
||||
offsize = b">H"
|
||||
offwidth = 2
|
||||
ptr = 0
|
||||
for i in range(self.pm_nn):
|
||||
(od,) = struct.unpack_from(offsize, self.pmoff, ptr)
|
||||
ptr += offwidth
|
||||
self.pageoffsets.append(od)
|
||||
self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
|
||||
|
||||
def getPageMap(self):
|
||||
return self.pageMap
|
||||
|
||||
def getNames(self):
|
||||
return self.pagenames
|
||||
|
||||
def getOffsets(self):
|
||||
return self.pageoffsets
|
||||
|
||||
# page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
|
||||
def generateKF8PageMapXML(self, k8proc):
|
||||
pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
|
||||
for i in range(len(self.pagenames)):
|
||||
pos = self.pageoffsets[i]
|
||||
name = self.pagenames[i]
|
||||
if name is not None and name != "":
|
||||
[pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
|
||||
idtext = unicode_str(k8proc.getPageIDTag(pos))
|
||||
linktgt = unicode_str(filename)
|
||||
if idtext != "":
|
||||
linktgt += "#" + idtext
|
||||
pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
|
||||
pagemapxml += "</page-map>\n"
|
||||
return pagemapxml
|
||||
|
||||
def generateAPNX(self, apnx_meta):
|
||||
if apnx_meta["format"] == "MOBI_8":
|
||||
content_header = (
|
||||
'{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}'
|
||||
% apnx_meta
|
||||
)
|
||||
else:
|
||||
content_header = (
|
||||
'{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}'
|
||||
% apnx_meta
|
||||
)
|
||||
content_header = content_header.encode("utf-8")
|
||||
page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
|
||||
page_header = page_header.encode("utf-8")
|
||||
apnx = struct.pack(b">H", 1) + struct.pack(b">H", 1)
|
||||
apnx += struct.pack(b">I", 12 + len(content_header))
|
||||
apnx += struct.pack(b">I", len(content_header))
|
||||
apnx += content_header
|
||||
apnx += struct.pack(b">H", 1)
|
||||
apnx += struct.pack(b">H", len(page_header))
|
||||
apnx += struct.pack(b">H", self.pm_nn)
|
||||
apnx += struct.pack(b">H", 32)
|
||||
apnx += page_header
|
||||
for page in self.pageoffsets:
|
||||
apnx += struct.pack(b">L", page)
|
||||
return apnx
|
||||
204
mobiparse/mobi/mobi_sectioner.py
Executable file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
|
||||
from loguru import logger
|
||||
|
||||
import datetime
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
import struct
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
DUMP = False
|
||||
""" Set to True to dump all possible information. """
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def describe(data):
|
||||
txtans = ""
|
||||
hexans = hexlify(data)
|
||||
for i in data:
|
||||
if bord(i) < 32 or bord(i) > 127:
|
||||
txtans += "?"
|
||||
else:
|
||||
txtans += bchar(i).decode("latin-1")
|
||||
return '"' + txtans + '"' + " 0x" + hexans
|
||||
|
||||
|
||||
def datetimefrompalmtime(palmtime):
|
||||
if palmtime > 0x7FFFFFFF:
|
||||
pythondatetime = datetime.datetime(
|
||||
year=1904, month=1, day=1
|
||||
) + datetime.timedelta(seconds=palmtime)
|
||||
else:
|
||||
pythondatetime = datetime.datetime(
|
||||
year=1970, month=1, day=1
|
||||
) + datetime.timedelta(seconds=palmtime)
|
||||
return pythondatetime
|
||||
|
||||
|
||||
class Sectionizer:
|
||||
def __init__(self, filename):
|
||||
self.data = b""
|
||||
with open(pathof(filename), "rb") as f:
|
||||
self.data = f.read()
|
||||
self.palmheader = self.data[:78]
|
||||
self.palmname = self.data[:32]
|
||||
self.ident = self.palmheader[0x3C : 0x3C + 8]
|
||||
# CG struct.unpack_from(fmt, buffer, offset=0)
|
||||
(self.num_sections,) = struct.unpack_from(b">H", self.palmheader, 76)
|
||||
self.filelength = len(self.data)
|
||||
|
||||
## CGDBG ???
|
||||
## sectionsdata (9680, 0, 18618, 2, 22275, 4, 25504, 6, 28607, 8,...
|
||||
sectionsdata = struct.unpack_from(bstr(">%dL" % (self.num_sections * 2)), self.data, 78) + (self.filelength, 0)
|
||||
|
||||
## offsets and lengths of all sections
|
||||
# sectionsoffset (9680, 18618, 22275, 25504, 28607, ...
|
||||
self.sectionoffsets = sectionsdata[::2]
|
||||
# sectionattributes (0, 2, 4, 6, 8, ...
|
||||
self.sectionattributes = sectionsdata[1::2]
|
||||
self.sectiondescriptions = ["" for x in range(self.num_sections + 1)]
|
||||
self.sectiondescriptions[-1] = "File Length Only"
|
||||
|
||||
# CGDBG what does unpack_from return? a tuple (,)
|
||||
print( 'sectionsdata {} {}'.format(sectionsdata, bstr(">%dL" % (self.num_sections * 2))))
|
||||
print( 'sectionsoffset {} \n sectionattributes {}'.format( self.sectionoffsets, self.sectionattributes ))
|
||||
print( 'sectionsdescriptions {} '.format( self.sectiondescriptions))
|
||||
print( bstr(">%dL" % (self.num_sections * 2) ) )
|
||||
print( struct.unpack_from(bstr(">%dL" % (self.num_sections * 2)) , self.data, 78) )
|
||||
print( (self.filelength, 0) )
|
||||
|
||||
return
|
||||
|
||||
# sections information
|
||||
def dumpsectionsinfo(self):
|
||||
logger.debug("Section Offset Length UID Attribs Description")
|
||||
for i in range(self.num_sections):
|
||||
'''
|
||||
logger.debug(
|
||||
"{} {} {} {} {} {} {}\n".format( i, i,
|
||||
self.sectionoffsets[i],
|
||||
self.sectionoffsets[i + 1] - self.sectionoffsets[i],
|
||||
self.sectionattributes[i] & 0xFFFFFF,
|
||||
(self.sectionattributes[i] >> 24) & 0xFF,
|
||||
self.sectiondescriptions[i]))
|
||||
'''
|
||||
logger.debug(
|
||||
"%3d %3X 0x%07X 0x%05X % 8d % 7d %s"
|
||||
% (
|
||||
i,
|
||||
i,
|
||||
self.sectionoffsets[i],
|
||||
self.sectionoffsets[i + 1] - self.sectionoffsets[i],
|
||||
self.sectionattributes[i] & 0xFFFFFF,
|
||||
(self.sectionattributes[i] >> 24) & 0xFF,
|
||||
self.sectiondescriptions[i],
|
||||
)
|
||||
)
|
||||
logger.debug(
|
||||
"%3d %3X 0x%07X %s"
|
||||
% (
|
||||
self.num_sections,
|
||||
self.num_sections,
|
||||
self.sectionoffsets[self.num_sections],
|
||||
self.sectiondescriptions[self.num_sections],
|
||||
)
|
||||
)
|
||||
|
||||
def setsectiondescription(self, section, description):
|
||||
if section < len(self.sectiondescriptions):
|
||||
self.sectiondescriptions[section] = description
|
||||
else:
|
||||
logger.debug(
|
||||
"Section out of range: %d, description %s" % (section, description)
|
||||
)
|
||||
|
||||
def dumppalmheader(self):
|
||||
logger.debug("Palm Database Header")
|
||||
logger.debug("Database name: " + repr(self.palmheader[:32]))
|
||||
(dbattributes,) = struct.unpack_from(b">H", self.palmheader, 32)
|
||||
logger.debug("Bitfield attributes: 0x%0X" % dbattributes,)
|
||||
if dbattributes != 0:
|
||||
print(" (",)
|
||||
if dbattributes & 2:
|
||||
print("Read-only; ",)
|
||||
if dbattributes & 4:
|
||||
print("Dirty AppInfoArea; ",)
|
||||
if dbattributes & 8:
|
||||
print("Needs to be backed up; ",)
|
||||
if dbattributes & 16:
|
||||
print("OK to install over newer; ",)
|
||||
if dbattributes & 32:
|
||||
print("Reset after installation; ",)
|
||||
if dbattributes & 64:
|
||||
print("No copying by PalmPilot beaming; ",)
|
||||
print(")")
|
||||
else:
|
||||
print("")
|
||||
logger.debug(
|
||||
"File version: %d" % struct.unpack_from(b">H", self.palmheader, 34)[0]
|
||||
)
|
||||
(dbcreation,) = struct.unpack_from(b">L", self.palmheader, 36)
|
||||
logger.debug(
|
||||
"Creation Date: "
|
||||
+ str(datetimefrompalmtime(dbcreation))
|
||||
+ (" (0x%0X)" % dbcreation)
|
||||
)
|
||||
(dbmodification,) = struct.unpack_from(b">L", self.palmheader, 40)
|
||||
logger.debug(
|
||||
"Modification Date: "
|
||||
+ str(datetimefrompalmtime(dbmodification))
|
||||
+ (" (0x%0X)" % dbmodification)
|
||||
)
|
||||
(dbbackup,) = struct.unpack_from(b">L", self.palmheader, 44)
|
||||
if dbbackup != 0:
|
||||
logger.debug(
|
||||
"Backup Date: "
|
||||
+ str(datetimefrompalmtime(dbbackup))
|
||||
+ (" (0x%0X)" % dbbackup)
|
||||
)
|
||||
logger.debug(
|
||||
"Modification No.: %d" % struct.unpack_from(b">L", self.palmheader, 48)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"App Info offset: 0x%0X" % struct.unpack_from(b">L", self.palmheader, 52)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Sort Info offset: 0x%0X"
|
||||
% struct.unpack_from(b">L", self.palmheader, 56)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Type/Creator: %s/%s"
|
||||
% (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))
|
||||
)
|
||||
logger.debug(
|
||||
"Unique seed: 0x%0X" % struct.unpack_from(b">L", self.palmheader, 68)[0]
|
||||
)
|
||||
(expectedzero,) = struct.unpack_from(b">L", self.palmheader, 72)
|
||||
if expectedzero != 0:
|
||||
logger.debug(
|
||||
"Should be zero but isn't: %d"
|
||||
% struct.unpack_from(b">L", self.palmheader, 72)[0]
|
||||
)
|
||||
logger.debug(
|
||||
"Number of sections: %d" % struct.unpack_from(b">H", self.palmheader, 76)[0]
|
||||
)
|
||||
return
|
||||
|
||||
def loadSection(self, section):
|
||||
before, after = self.sectionoffsets[section : section + 2]
|
||||
return self.data[before:after]
|
||||
505
mobiparse/mobi/mobi_split.py
Executable file
@@ -0,0 +1,505 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
from loguru import logger
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
from .unipath import pathof
|
||||
|
||||
|
||||
# CG : reference https://wiki.mobileread.com/wiki/MOBI
|
||||
# important pdb header offsets
|
||||
unique_id_seed = 68
|
||||
number_of_pdb_records = 76
|
||||
|
||||
# important palmdoc header offsets
|
||||
book_length = 4
|
||||
book_record_count = 8
|
||||
first_pdb_record = 78
|
||||
|
||||
# important rec0 offsets
|
||||
length_of_book = 4
|
||||
mobi_header_base = 16
|
||||
mobi_header_length = 20
|
||||
mobi_type = 24
|
||||
mobi_version = 36
|
||||
first_non_text = 80
|
||||
title_offset = 84
|
||||
first_resc_record = 108
|
||||
first_content_index = 192
|
||||
last_content_index = 194
|
||||
kf8_fdst_index = 192 # for KF8 mobi headers
|
||||
fcis_index = 200
|
||||
flis_index = 208
|
||||
srcs_index = 224
|
||||
srcs_count = 228
|
||||
primary_index = 244
|
||||
datp_index = 256
|
||||
huffoff = 112
|
||||
hufftbloff = 120
|
||||
|
||||
|
||||
def getint(datain, ofs, sz=b"L"):
|
||||
(i,) = struct.unpack_from(b">" + sz, datain, ofs)
|
||||
return i
|
||||
|
||||
|
||||
def writeint(datain, ofs, n, len=b"L"):
|
||||
if len == b"L":
|
||||
return datain[:ofs] + struct.pack(b">L", n) + datain[ofs + 4 :]
|
||||
else:
|
||||
return datain[:ofs] + struct.pack(b">H", n) + datain[ofs + 2 :]
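# returns a new bytestring with a 4-byte (b"L") or 2-byte (b"H") big-endian value spliced in at ofs; the input is not modified in place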
|
||||
|
||||
|
||||
def getsecaddr(datain, secno):
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
assert 0 <= secno < nsec, "secno %d out of range (nsec=%d)" % (secno, nsec)
|
||||
secstart = getint(datain, first_pdb_record + secno * 8)
|
||||
if secno == nsec - 1:
|
||||
secend = len(datain)
|
||||
else:
|
||||
secend = getint(datain, first_pdb_record + (secno + 1) * 8)
|
||||
return secstart, secend
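# section boundaries come from the PDB record table starting at first_pdb_record; the last section runs to the end of the file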
|
||||
|
||||
|
||||
def readsection(datain, secno):
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
return datain[secstart:secend]
|
||||
|
||||
|
||||
def writesection(datain, secno, secdata): # overwrite, accounting for different length
|
||||
# dataout = deletesectionrange(datain,secno, secno)
|
||||
# return insertsection(dataout, secno, secdata)
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
dif = len(secdata) - (secend - secstart)
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * nsec + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec))
|
||||
newstart = zerosecstart
|
||||
for i in range(0, secno):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
datalst.append(struct.pack(b">L", secstart) + struct.pack(b">L", (2 * secno)))
|
||||
for i in range(secno + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs + dif
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * nsec)
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(secdata)
|
||||
datalst.append(datain[secend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def nullsection(datain, secno): # make it zero-length without deleting it
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = secend - secstart
|
||||
datalst.append(datain[:first_pdb_record])
|
||||
for i in range(0, secno + 1):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
for i in range(secno + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - dif
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = zerosecstart - (first_pdb_record + 8 * nsec)
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(datain[secend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def deletesectionrange(datain, firstsec, lastsec): # delete a range of sections
|
||||
datalst = []
|
||||
firstsecstart, firstsecend = getsecaddr(datain, firstsec)
|
||||
lastsecstart, lastsecend = getsecaddr(datain, lastsec)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = lastsecend - firstsecstart + 8 * (lastsec - firstsec + 1)
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec - (lastsec - firstsec + 1)) + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec - (lastsec - firstsec + 1)))
|
||||
newstart = zerosecstart - 8 * (lastsec - firstsec + 1)
|
||||
for i in range(0, firstsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - 8 * (lastsec - firstsec + 1)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
for i in range(lastsec + 1, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs - dif
|
||||
flgval = 2 * (i - (lastsec - firstsec + 1))
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec - (lastsec - firstsec + 1)))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:firstsecstart])
|
||||
datalst.append(datain[lastsecend:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def insertsection(datain, secno, secdata): # insert a new section
|
||||
datalst = []
|
||||
nsec = getint(datain, number_of_pdb_records, b"H")
|
||||
# print("inserting secno" , secno, "into" ,nsec, "sections")
|
||||
secstart, secend = getsecaddr(datain, secno)
|
||||
zerosecstart, zerosecend = getsecaddr(datain, 0)
|
||||
dif = len(secdata)
|
||||
datalst.append(datain[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec + 1) + 1))
|
||||
datalst.append(datain[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec + 1))
|
||||
newstart = zerosecstart + 8
|
||||
for i in range(0, secno):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs += 8
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
datalst.append(struct.pack(b">L", secstart + 8) + struct.pack(b">L", (2 * secno)))
|
||||
for i in range(secno, nsec):
|
||||
ofs, flgval = struct.unpack_from(b">2L", datain, first_pdb_record + i * 8)
|
||||
ofs = ofs + dif + 8
|
||||
flgval = 2 * (i + 1)
|
||||
datalst.append(struct.pack(b">L", ofs) + struct.pack(b">L", flgval))
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec + 1))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(datain[zerosecstart:secstart])
|
||||
datalst.append(secdata)
|
||||
datalst.append(datain[secstart:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
def insertsectionrange(
|
||||
sectionsource, firstsec, lastsec, sectiontarget, targetsec
|
||||
): # insert a range of sections
|
||||
# print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
|
||||
# dataout = sectiontarget
|
||||
# for idx in range(lastsec,firstsec-1,-1):
|
||||
# dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
|
||||
# return dataout
|
||||
datalst = []
|
||||
nsec = getint(sectiontarget, number_of_pdb_records, b"H")
|
||||
zerosecstart, zerosecend = getsecaddr(sectiontarget, 0)
|
||||
insstart, nul = getsecaddr(sectiontarget, targetsec)
|
||||
nins = lastsec - firstsec + 1
|
||||
srcstart, nul = getsecaddr(sectionsource, firstsec)
|
||||
nul, srcend = getsecaddr(sectionsource, lastsec)
|
||||
newstart = zerosecstart + 8 * nins
|
||||
|
||||
datalst.append(sectiontarget[:unique_id_seed])
|
||||
datalst.append(struct.pack(b">L", 2 * (nsec + nins) + 1))
|
||||
datalst.append(sectiontarget[unique_id_seed + 4 : number_of_pdb_records])
|
||||
datalst.append(struct.pack(b">H", nsec + nins))
|
||||
for i in range(0, targetsec):
|
||||
ofs, flgval = struct.unpack_from(
|
||||
b">2L", sectiontarget, first_pdb_record + i * 8
|
||||
)
|
||||
ofsnew = ofs + 8 * nins
|
||||
flgvalnew = flgval
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||
srcstart0, nul = getsecaddr(sectionsource, firstsec)
|
||||
for i in range(nins):
|
||||
isrcstart, nul = getsecaddr(sectionsource, firstsec + i)
|
||||
ofsnew = insstart + (isrcstart - srcstart0) + 8 * nins
|
||||
flgvalnew = 2 * (targetsec + i)
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew)
|
||||
dif = srcend - srcstart
|
||||
for i in range(targetsec, nsec):
|
||||
ofs, flgval = struct.unpack_from(
|
||||
b">2L", sectiontarget, first_pdb_record + i * 8
|
||||
)
|
||||
ofsnew = ofs + dif + 8 * nins
|
||||
flgvalnew = 2 * (i + nins)
|
||||
datalst.append(struct.pack(b">L", ofsnew) + struct.pack(b">L", flgvalnew))
|
||||
# print(ofsnew, flgvalnew, ofs, flgval)
|
||||
lpad = newstart - (first_pdb_record + 8 * (nsec + nins))
|
||||
if lpad > 0:
|
||||
datalst.append(b"\0" * lpad)
|
||||
datalst.append(sectiontarget[zerosecstart:insstart])
|
||||
datalst.append(sectionsource[srcstart:srcend])
|
||||
datalst.append(sectiontarget[insstart:])
|
||||
dataout = b"".join(datalst)
|
||||
return dataout
|
||||
|
||||
|
||||
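# return the EXTH block's base offset inside rec0 plus its total length and record count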
def get_exth_params(rec0):
|
||||
ebase = mobi_header_base + getint(rec0, mobi_header_length)
|
||||
elen = getint(rec0, ebase + 4)
|
||||
enum = getint(rec0, ebase + 8)
|
||||
return ebase, elen, enum
|
||||
|
||||
|
||||
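# insert a new EXTH record (id exth_num) and bump the EXTH length, record count and title offset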
def add_exth(rec0, exth_num, exth_bytes):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
newrecsize = 8 + len(exth_bytes)
|
||||
newrec0 = (
|
||||
rec0[0 : ebase + 4]
|
||||
+ struct.pack(b">L", elen + newrecsize)
|
||||
+ struct.pack(b">L", enum + 1)
|
||||
+ struct.pack(b">L", exth_num)
|
||||
+ struct.pack(b">L", newrecsize)
|
||||
+ exth_bytes
|
||||
+ rec0[ebase + 12 :]
|
||||
)
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) + newrecsize
|
||||
)
|
||||
return newrec0
|
||||
|
||||
|
||||
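# collect the data payloads of every EXTH record whose id matches exth_num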
def read_exth(rec0, exth_num):
|
||||
exth_values = []
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase = ebase + 12
|
||||
while enum > 0:
|
||||
exth_id = getint(rec0, ebase)
|
||||
if exth_id == exth_num:
|
||||
# We might have multiple exths, so build a list.
|
||||
exth_values.append(rec0[ebase + 8 : ebase + getint(rec0, ebase + 4)])
|
||||
enum = enum - 1
|
||||
ebase = ebase + getint(rec0, ebase + 4)
|
||||
return exth_values
|
||||
|
||||
|
||||
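# overwrite the first EXTH record matching exth_num, adjusting the EXTH length and the title offset for any size change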
def write_exth(rec0, exth_num, exth_bytes):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase_idx = ebase + 12
|
||||
enum_idx = enum
|
||||
while enum_idx > 0:
|
||||
exth_id = getint(rec0, ebase_idx)
|
||||
if exth_id == exth_num:
|
||||
dif = len(exth_bytes) + 8 - getint(rec0, ebase_idx + 4)
|
||||
newrec0 = rec0
|
||||
if dif != 0:
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) + dif
|
||||
)
|
||||
return (
|
||||
newrec0[: ebase + 4]
|
||||
+ struct.pack(
|
||||
b">L", elen + len(exth_bytes) + 8 - getint(rec0, ebase_idx + 4)
|
||||
)
|
||||
+ struct.pack(b">L", enum)
|
||||
+ rec0[ebase + 12 : ebase_idx + 4]
|
||||
+ struct.pack(b">L", len(exth_bytes) + 8)
|
||||
+ exth_bytes
|
||||
+ rec0[ebase_idx + getint(rec0, ebase_idx + 4) :]
|
||||
)
|
||||
enum_idx = enum_idx - 1
|
||||
ebase_idx = ebase_idx + getint(rec0, ebase_idx + 4)
|
||||
return rec0
|
||||
|
||||
|
||||
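# remove the first EXTH record matching exth_num, shrinking the EXTH length, record count and title offset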
def del_exth(rec0, exth_num):
|
||||
ebase, elen, enum = get_exth_params(rec0)
|
||||
ebase_idx = ebase + 12
|
||||
enum_idx = 0
|
||||
while enum_idx < enum:
|
||||
exth_id = getint(rec0, ebase_idx)
|
||||
exth_size = getint(rec0, ebase_idx + 4)
|
||||
if exth_id == exth_num:
|
||||
newrec0 = rec0
|
||||
newrec0 = writeint(
|
||||
newrec0, title_offset, getint(newrec0, title_offset) - exth_size
|
||||
)
|
||||
newrec0 = newrec0[:ebase_idx] + newrec0[ebase_idx + exth_size :]
|
||||
newrec0 = (
|
||||
newrec0[0 : ebase + 4]
|
||||
+ struct.pack(b">L", elen - exth_size)
|
||||
+ struct.pack(b">L", enum - 1)
|
||||
+ newrec0[ebase + 12 :]
|
||||
)
|
||||
return newrec0
|
||||
enum_idx += 1
|
||||
ebase_idx = ebase_idx + exth_size
|
||||
return rec0
|
||||
|
||||
|
||||
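# split a combination KF7/KF8 mobi (joined by a BOUNDARY record) into standalone mobi7 and mobi8 images;
# self.combo is False when there is nothing to split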
class mobi_split:
|
||||
def __init__(self, infile):
|
||||
datain = b""
|
||||
with open(pathof(infile), "rb") as f:
|
||||
datain = f.read()
|
||||
datain_rec0 = readsection(datain, 0)
|
||||
ver = getint(datain_rec0, mobi_version)
|
||||
self.combo = ver != 8
|
||||
if not self.combo:
|
||||
return
|
||||
exth121 = read_exth(datain_rec0, 121)
|
||||
if len(exth121) == 0:
|
||||
self.combo = False
|
||||
return
|
||||
else:
|
||||
# only pay attention to first exth121
|
||||
# (there should only be one)
|
||||
(datain_kf8,) = struct.unpack_from(b">L", exth121[0], 0)
|
||||
if datain_kf8 == 0xFFFFFFFF:
|
||||
self.combo = False
|
||||
return
|
||||
datain_kfrec0 = readsection(datain, datain_kf8)
|
||||
|
||||
# create the standalone mobi7
|
||||
num_sec = getint(datain, number_of_pdb_records, b"H")
|
||||
# remove BOUNDARY up to but not including ELF record
|
||||
self.result_file7 = deletesectionrange(datain, datain_kf8 - 1, num_sec - 2)
|
||||
# check if there are SRCS records and delete them
|
||||
srcs = getint(datain_rec0, srcs_index)
|
||||
num_srcs = getint(datain_rec0, srcs_count)
|
||||
if srcs != 0xFFFFFFFF and num_srcs > 0:
|
||||
self.result_file7 = deletesectionrange(
|
||||
self.result_file7, srcs, srcs + num_srcs - 1
|
||||
)
|
||||
datain_rec0 = writeint(datain_rec0, srcs_index, 0xFFFFFFFF)
|
||||
datain_rec0 = writeint(datain_rec0, srcs_count, 0)
|
||||
# reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
|
||||
datain_rec0 = write_exth(datain_rec0, 121, struct.pack(b">L", 0xFFFFFFFF))
|
||||
# datain_rec0 = del_exth(datain_rec0,121)
|
||||
# datain_rec0 = del_exth(datain_rec0,534)
|
||||
# don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
|
||||
# set the EXTH 129 KF8 Masthead / Cover Image string to the null string
|
||||
datain_rec0 = write_exth(datain_rec0, 129, b"")
|
||||
# don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
|
||||
|
||||
# need to reset flags stored in 0x80-0x83
|
||||
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||
# Bit Flags
|
||||
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||
# 0x0040 = exth exists
|
||||
# 0x0010 = Not sure but this is always set so far
|
||||
(fval,) = struct.unpack_from(b">L", datain_rec0, 0x80)
|
||||
# need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
|
||||
fval = fval & 0x07FF
|
||||
datain_rec0 = datain_rec0[:0x80] + struct.pack(b">L", fval) + datain_rec0[0x84:]
|
||||
|
||||
self.result_file7 = writesection(self.result_file7, 0, datain_rec0)
|
||||
|
||||
# no need to replace kf8 style fcis with mobi 7 one
|
||||
# fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
|
||||
# if fcis_secnum != 0xffffffff:
|
||||
# fcis_info = readsection(datain, fcis_secnum)
|
||||
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||
# new_fcis += struct.pack(b'>L',text_len)
|
||||
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||
# self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
|
||||
|
||||
firstimage = getint(datain_rec0, first_resc_record)
|
||||
lastimage = getint(datain_rec0, last_content_index, b"H")
|
||||
# print("Old First Image, last Image", firstimage,lastimage)
|
||||
if lastimage == 0xFFFF:
|
||||
# find the lowest of the next sections and copy up to that.
|
||||
ofs_list = [
|
||||
(fcis_index, b"L"),
|
||||
(flis_index, b"L"),
|
||||
(datp_index, b"L"),
|
||||
(hufftbloff, b"L"),
|
||||
]
|
||||
for ofs, sz in ofs_list:
|
||||
n = getint(datain_rec0, ofs, sz)
|
||||
# print("n",n)
|
||||
if n > 0 and n < lastimage:
|
||||
lastimage = n - 1
|
||||
logger.debug("First Image, last Image %s %s" % (firstimage, lastimage))
|
||||
|
||||
# Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
|
||||
for i in range(firstimage, lastimage):
|
||||
imgsec = readsection(self.result_file7, i)
|
||||
if imgsec[0:4] in [b"RESC", b"FONT"]:
|
||||
self.result_file7 = nullsection(self.result_file7, i)
|
||||
|
||||
# mobi7 finished
|
||||
|
||||
# create standalone mobi8
|
||||
self.result_file8 = deletesectionrange(datain, 0, datain_kf8 - 1)
|
||||
target = getint(datain_kfrec0, first_resc_record)
|
||||
self.result_file8 = insertsectionrange(
|
||||
datain, firstimage, lastimage, self.result_file8, target
|
||||
)
|
||||
datain_kfrec0 = readsection(self.result_file8, 0)
|
||||
|
||||
# Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
|
||||
kf8starts = read_exth(datain_kfrec0, 116)
|
||||
# If we have multiple StartOffset, keep only the last one
|
||||
kf8start_count = len(kf8starts)
|
||||
while kf8start_count > 1:
|
||||
kf8start_count -= 1
|
||||
datain_kfrec0 = del_exth(datain_kfrec0, 116)
|
||||
|
||||
# update the EXTH 125 KF8 Count of Images/Fonts/Resources
|
||||
datain_kfrec0 = write_exth(
|
||||
datain_kfrec0, 125, struct.pack(b">L", lastimage - firstimage + 1)
|
||||
)
|
||||
|
||||
# need to reset flags stored in 0x80-0x83
|
||||
# old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
|
||||
# standalone mobi8 with exth: 0x0050
|
||||
# Bit Flags
|
||||
# 0x1000 = Bit 12 indicates if embedded fonts are used or not
|
||||
# 0x0800 = means this Header points to *shared* images/resource/fonts ??
|
||||
# 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
|
||||
# 0x0040 = exth exists
|
||||
# 0x0010 = Not sure but this is always set so far
|
||||
(fval,) = struct.unpack_from(b">L", datain_kfrec0, 0x80)
|
||||
fval = fval & 0x1FFF
|
||||
fval |= 0x0800
|
||||
datain_kfrec0 = (
|
||||
datain_kfrec0[:0x80] + struct.pack(b">L", fval) + datain_kfrec0[0x84:]
|
||||
)
|
||||
|
||||
# properly update other index pointers that have been shifted by the insertion of images
|
||||
ofs_list = [
|
||||
(kf8_fdst_index, b"L"),
|
||||
(fcis_index, b"L"),
|
||||
(flis_index, b"L"),
|
||||
(datp_index, b"L"),
|
||||
(hufftbloff, b"L"),
|
||||
]
|
||||
for ofs, sz in ofs_list:
|
||||
n = getint(datain_kfrec0, ofs, sz)
|
||||
if n != 0xFFFFFFFF:
|
||||
datain_kfrec0 = writeint(
|
||||
datain_kfrec0, ofs, n + lastimage - firstimage + 1, sz
|
||||
)
|
||||
self.result_file8 = writesection(self.result_file8, 0, datain_kfrec0)
|
||||
|
||||
# no need to replace kf8 style fcis with mobi 7 one
|
||||
# fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
|
||||
# if fcis_secnum != 0xffffffff:
|
||||
# fcis_info = readsection(self.result_file8, fcis_secnum)
|
||||
# text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
|
||||
# new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
|
||||
# new_fcis += struct.pack(b'>L',text_len)
|
||||
# new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
|
||||
# self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
|
||||
|
||||
# mobi8 finished
|
||||
|
||||
def getResult8(self):
|
||||
return self.result_file8
|
||||
|
||||
def getResult7(self):
|
||||
return self.result_file7
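# typical use (sketch): m = mobi_split(infile); if m.combo, write m.getResult7() and m.getResult8() out as separate .mobi files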
|
||||
138
mobiparse/mobi/mobi_uncompress.py
Executable file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bchr, lmap, bstr
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UncompressedReader:
|
||||
def unpack(self, data):
|
||||
return data
|
||||
|
||||
|
||||
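# PalmDOC (LZ77-style) decompressor: literal runs, single literal bytes, space+char pairs (>= 0xC0) and back-references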
class PalmdocReader:
|
||||
def unpack(self, i):
|
||||
o, p = b"", 0
|
||||
while p < len(i):
|
||||
# for python 3 must use slice since i[p] returns int while slice returns character
|
||||
c = ord(i[p : p + 1])
|
||||
p += 1
|
||||
if c >= 1 and c <= 8:
|
||||
o += i[p : p + c]
|
||||
p += c
|
||||
elif c < 128:
|
||||
o += bchr(c)
|
||||
elif c >= 192:
|
||||
o += b" " + bchr(c ^ 128)
|
||||
else:
|
||||
if p < len(i):
|
||||
c = (c << 8) | ord(i[p : p + 1])
|
||||
p += 1
|
||||
m = (c >> 3) & 0x07FF
|
||||
n = (c & 7) + 3
|
||||
if m > n:
|
||||
o += o[-m : n - m]
|
||||
else:
|
||||
for _ in range(n):
|
||||
# because of completely ass-backwards decision by python maintainers for python 3
|
||||
# we must use slice for bytes as i[p] returns int while slice returns character
|
||||
if m == 1:
|
||||
o += o[-m:]
|
||||
else:
|
||||
o += o[-m : -m + 1]
|
||||
return o
|
||||
|
||||
|
||||
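# HUFF/CDIC decompressor: loadHuff and loadCdic parse the huffman tables and phrase dictionary,
# unpack decodes the compressed text, recursively expanding not-yet-expanded dictionary entries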
class HuffcdicReader:
|
||||
q = struct.Struct(b">Q").unpack_from
|
||||
|
||||
def loadHuff(self, huff):
|
||||
if huff[0:8] != b"HUFF\x00\x00\x00\x18":
|
||||
raise unpackException("invalid huff header")
|
||||
off1, off2 = struct.unpack_from(b">LL", huff, 8)
|
||||
|
||||
def dict1_unpack(v):
|
||||
codelen, term, maxcode = v & 0x1F, v & 0x80, v >> 8
|
||||
assert codelen != 0
|
||||
if codelen <= 8:
|
||||
assert term
|
||||
maxcode = ((maxcode + 1) << (32 - codelen)) - 1
|
||||
return (codelen, term, maxcode)
|
||||
|
||||
self.dict1 = lmap(dict1_unpack, struct.unpack_from(b">256L", huff, off1))
|
||||
|
||||
dict2 = struct.unpack_from(b">64L", huff, off2)
|
||||
self.mincode, self.maxcode = (), ()
|
||||
for codelen, mincode in enumerate((0,) + dict2[0::2]):
|
||||
self.mincode += (mincode << (32 - codelen),)
|
||||
for codelen, maxcode in enumerate((0,) + dict2[1::2]):
|
||||
self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1,)
|
||||
|
||||
self.dictionary = []
|
||||
|
||||
def loadCdic(self, cdic):
|
||||
if cdic[0:8] != b"CDIC\x00\x00\x00\x10":
|
||||
raise unpackException("invalid cdic header")
|
||||
phrases, bits = struct.unpack_from(b">LL", cdic, 8)
|
||||
n = min(1 << bits, phrases - len(self.dictionary))
|
||||
h = struct.Struct(b">H").unpack_from
|
||||
|
||||
def getslice(off):
|
||||
(blen,) = h(cdic, 16 + off)
|
||||
slice = cdic[18 + off : 18 + off + (blen & 0x7FFF)]
|
||||
return (slice, blen & 0x8000)
|
||||
|
||||
self.dictionary += lmap(
|
||||
getslice, struct.unpack_from(bstr(">%dH" % n), cdic, 16)
|
||||
)
|
||||
|
||||
def unpack(self, data):
|
||||
q = HuffcdicReader.q
|
||||
|
||||
bitsleft = len(data) * 8
|
||||
data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||
pos = 0
|
||||
(x,) = q(data, pos)
|
||||
n = 32
|
||||
|
||||
s = b""
|
||||
while True:
|
||||
if n <= 0:
|
||||
pos += 4
|
||||
(x,) = q(data, pos)
|
||||
n += 32
|
||||
code = (x >> n) & ((1 << 32) - 1)
|
||||
|
||||
codelen, term, maxcode = self.dict1[code >> 24]
|
||||
if not term:
|
||||
while code < self.mincode[codelen]:
|
||||
codelen += 1
|
||||
maxcode = self.maxcode[codelen]
|
||||
|
||||
n -= codelen
|
||||
bitsleft -= codelen
|
||||
if bitsleft < 0:
|
||||
break
|
||||
|
||||
r = (maxcode - code) >> (32 - codelen)
|
||||
slice, flag = self.dictionary[r]
|
||||
if not flag:
|
||||
self.dictionary[r] = None
|
||||
slice = self.unpack(slice)
|
||||
self.dictionary[r] = (slice, 1)
|
||||
s += slice
|
||||
return s
|
||||
252
mobiparse/mobi/mobi_utils.py
Executable file
@@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
# flake8: noqa
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, text_type, bchr, bord
|
||||
|
||||
import binascii
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
from itertools import cycle
|
||||
|
||||
|
||||
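# map a Mobi language id / sub-language id pair to an IANA language tag (falls back to "en")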
def getLanguage(langID, sublangID):
|
||||
mobilangdict = {
|
||||
54: {0: "af"}, # Afrikaans
|
||||
28: {0: "sq"}, # Albanian
|
||||
1: {
|
||||
0: "ar",
|
||||
5: "ar-dz",
|
||||
15: "ar-bh",
|
||||
3: "ar-eg",
|
||||
2: "ar-iq",
|
||||
11: "ar-jo",
|
||||
13: "ar-kw",
|
||||
12: "ar-lb",
|
||||
4: "ar-ly",
|
||||
6: "ar-ma",
|
||||
8: "ar-om",
|
||||
16: "ar-qa",
|
||||
1: "ar-sa",
|
||||
10: "ar-sy",
|
||||
7: "ar-tn",
|
||||
14: "ar-ae",
|
||||
9: "ar-ye",
|
||||
},
|
||||
# Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
|
||||
# (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
|
||||
# (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
|
||||
# (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
|
||||
# Emirates), Arabic (Yemen)
|
||||
43: {0: "hy"}, # Armenian
|
||||
77: {0: "as"}, # Assamese
|
||||
44: {0: "az"},  # "Azeri" (IANA: Azerbaijani)
|
||||
45: {0: "eu"}, # Basque
|
||||
35: {0: "be"}, # Belarusian
|
||||
69: {0: "bn"}, # Bengali
|
||||
2: {0: "bg"}, # Bulgarian
|
||||
3: {0: "ca"}, # Catalan
|
||||
4: {0: "zh", 3: "zh-hk", 2: "zh-cn", 4: "zh-sg", 1: "zh-tw"},
|
||||
# Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
|
||||
26: {0: "hr", 3: "sr"}, # Croatian, Serbian
|
||||
5: {0: "cs"}, # Czech
|
||||
6: {0: "da"}, # Danish
|
||||
19: {0: "nl", 1: "nl", 2: "nl-be"}, # Dutch / Flemish, Dutch (Belgium)
|
||||
9: {
|
||||
0: "en",
|
||||
1: "en",
|
||||
3: "en-au",
|
||||
40: "en-bz",
|
||||
4: "en-ca",
|
||||
6: "en-ie",
|
||||
8: "en-jm",
|
||||
5: "en-nz",
|
||||
13: "en-ph",
|
||||
7: "en-za",
|
||||
11: "en-tt",
|
||||
2: "en-gb",
|
||||
1: "en-us",
|
||||
12: "en-zw",
|
||||
},
|
||||
# English, English (Australia), English (Belize), English (Canada),
|
||||
# English (Ireland), English (Jamaica), English (New Zealand), English
|
||||
# (Philippines), English (South Africa), English (Trinidad), English
|
||||
# (United Kingdom), English (United States), English (Zimbabwe)
|
||||
37: {0: "et"}, # Estonian
|
||||
56: {0: "fo"}, # Faroese
|
||||
41: {0: "fa"}, # Farsi / Persian
|
||||
11: {0: "fi"}, # Finnish
|
||||
12: {
|
||||
0: "fr",
|
||||
1: "fr",
|
||||
2: "fr-be",
|
||||
3: "fr-ca",
|
||||
5: "fr-lu",
|
||||
6: "fr-mc",
|
||||
4: "fr-ch",
|
||||
},
|
||||
# French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
|
||||
55: {0: "ka"}, # Georgian
|
||||
7: {0: "de", 1: "de", 3: "de-at", 5: "de-li", 4: "de-lu", 2: "de-ch"},
|
||||
# German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
|
||||
8: {0: "el"}, # Greek, Modern (1453-)
|
||||
71: {0: "gu"}, # Gujarati
|
||||
13: {0: "he"}, # Hebrew (also code 'iw'?)
|
||||
57: {0: "hi"}, # Hindi
|
||||
14: {0: "hu"}, # Hungarian
|
||||
15: {0: "is"}, # Icelandic
|
||||
33: {0: "id"}, # Indonesian
|
||||
16: {0: "it", 1: "it", 2: "it-ch"}, # Italian, Italian (Switzerland)
|
||||
17: {0: "ja"}, # Japanese
|
||||
75: {0: "kn"}, # Kannada
|
||||
63: {0: "kk"}, # Kazakh
|
||||
87: {0: "x-kok"}, # Konkani (real language code is 'kok'?)
|
||||
18: {0: "ko"}, # Korean
|
||||
38: {0: "lv"}, # Latvian
|
||||
39: {0: "lt"}, # Lithuanian
|
||||
47: {0: "mk"}, # Macedonian
|
||||
62: {0: "ms"}, # Malay
|
||||
76: {0: "ml"}, # Malayalam
|
||||
58: {0: "mt"}, # Maltese
|
||||
78: {0: "mr"}, # Marathi
|
||||
97: {0: "ne"}, # Nepali
|
||||
20: {0: "no"}, # Norwegian
|
||||
72: {0: "or"}, # Oriya
|
||||
21: {0: "pl"}, # Polish
|
||||
22: {0: "pt", 2: "pt", 1: "pt-br"}, # Portuguese, Portuguese (Brazil)
|
||||
70: {0: "pa"}, # Punjabi
|
||||
23: {0: "rm"}, # "Rhaeto-Romanic" (IANA: Romansh)
|
||||
24: {0: "ro"}, # Romanian
|
||||
25: {0: "ru"}, # Russian
|
||||
59: {0: "sz"}, # "Sami (Lappish)" (not an IANA language code)
|
||||
# IANA code for "Northern Sami" is 'se'
|
||||
# 'SZ' is the IANA region code for Swaziland
|
||||
79: {0: "sa"}, # Sanskrit
|
||||
27: {0: "sk"}, # Slovak
|
||||
36: {0: "sl"}, # Slovenian
|
||||
46: {0: "sb"}, # "Sorbian" (not an IANA language code)
|
||||
# 'SB' is IANA region code for 'Solomon Islands'
|
||||
# Lower Sorbian = 'dsb'
|
||||
# Upper Sorbian = 'hsb'
|
||||
# Sorbian Languages = 'wen'
|
||||
10: {
|
||||
0: "es",
|
||||
4: "es",
|
||||
44: "es-ar",
|
||||
64: "es-bo",
|
||||
52: "es-cl",
|
||||
36: "es-co",
|
||||
20: "es-cr",
|
||||
28: "es-do",
|
||||
48: "es-ec",
|
||||
68: "es-sv",
|
||||
16: "es-gt",
|
||||
72: "es-hn",
|
||||
8: "es-mx",
|
||||
76: "es-ni",
|
||||
24: "es-pa",
|
||||
60: "es-py",
|
||||
40: "es-pe",
|
||||
80: "es-pr",
|
||||
56: "es-uy",
|
||||
32: "es-ve",
|
||||
},
|
||||
# Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
|
||||
# (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
|
||||
# Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
|
||||
# Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
|
||||
# Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
|
||||
# (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
|
||||
48: {0: "sx"}, # "Sutu" (not an IANA language code)
|
||||
# "Sutu" is another name for "Southern Sotho"?
|
||||
# IANA code for "Southern Sotho" is 'st'
|
||||
65: {0: "sw"}, # Swahili
|
||||
29: {0: "sv", 1: "sv", 8: "sv-fi"}, # Swedish, Swedish (Finland)
|
||||
73: {0: "ta"}, # Tamil
|
||||
68: {0: "tt"}, # Tatar
|
||||
74: {0: "te"}, # Telugu
|
||||
30: {0: "th"}, # Thai
|
||||
49: {0: "ts"}, # Tsonga
|
||||
50: {0: "tn"}, # Tswana
|
||||
31: {0: "tr"}, # Turkish
|
||||
34: {0: "uk"}, # Ukrainian
|
||||
32: {0: "ur"}, # Urdu
|
||||
67: {0: "uz", 2: "uz"}, # Uzbek
|
||||
42: {0: "vi"}, # Vietnamese
|
||||
52: {0: "xh"}, # Xhosa
|
||||
53: {0: "zu"}, # Zulu
|
||||
}
|
||||
lang = "en"
|
||||
if langID in mobilangdict:
|
||||
subdict = mobilangdict[langID]
|
||||
lang = subdict[0]
|
||||
if sublangID in subdict:
|
||||
lang = subdict[sublangID]
|
||||
return lang
|
||||
|
||||
|
||||
def toHex(byteList):
|
||||
return binascii.hexlify(byteList)
|
||||
|
||||
|
||||
# returns base32 bytestring
|
||||
def toBase32(value, npad=4):
|
||||
digits = b"0123456789ABCDEFGHIJKLMNOPQRSTUV"
|
||||
num_string = b""
|
||||
current = value
|
||||
while current != 0:
|
||||
next, remainder = divmod(current, 32)
|
||||
rem_string = digits[remainder : remainder + 1]
|
||||
num_string = rem_string + num_string
|
||||
current = next
|
||||
if num_string == b"":
|
||||
num_string = b"0"
|
||||
pad = npad - len(num_string)
|
||||
if pad > 0:
|
||||
num_string = b"0" * pad + num_string
|
||||
return num_string
|
||||
|
||||
|
||||
# converts base32 string to value
|
||||
def fromBase32(str_num):
|
||||
if isinstance(str_num, text_type):
|
||||
str_num = str_num.encode("latin-1")
|
||||
scalelst = [1, 32, 1024, 32768, 1048576, 33554432, 1073741824, 34359738368]
|
||||
value = 0
|
||||
j = 0
|
||||
n = len(str_num)
|
||||
scale = 0
|
||||
for i in range(n):
|
||||
c = str_num[n - i - 1 : n - i]
|
||||
if c in b"0123456789":
|
||||
v = ord(c) - ord(b"0")
|
||||
else:
|
||||
v = ord(c) - ord(b"A") + 10
|
||||
if j < len(scalelst):
|
||||
scale = scalelst[j]
|
||||
else:
|
||||
scale = scale * 32
|
||||
j += 1
|
||||
if v != 0:
|
||||
value = value + (v * scale)
|
||||
return value
|
||||
|
||||
|
||||
# note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
|
||||
# in place of ascii you will get a byte to half-word or integer
|
||||
# one to one mapping of values from 0 - 255
|
||||
|
||||
|
||||
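# Adobe-style font obfuscation: XOR the first 1024 bytes of the font with the key, leave the remainder untouched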
def mangle_fonts(encryption_key, data):
|
||||
if isinstance(encryption_key, text_type):
|
||||
encryption_key = encryption_key.encode("latin-1")
|
||||
crypt = data[:1024]
|
||||
key = cycle(iter(map(bord, encryption_key)))
|
||||
# encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
|
||||
encrypt = b"".join([bchr(bord(x) ^ next(key)) for x in crypt])
|
||||
return encrypt + data[1024:]
|
||||
585
mobiparse/mobi/mobiml2xhtml.py
Executable file
@@ -0,0 +1,585 @@
|
||||
#! /usr/bin/python
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
|
||||
# this program works in concert with the output from KindleUnpack
|
||||
|
||||
"""
|
||||
Convert from Mobi ML to XHTML
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
SPECIAL_HANDLING_TAGS = {
|
||||
"?xml": ("xmlheader", -1),
|
||||
"!--": ("comment", -3),
|
||||
"!DOCTYPE": ("doctype", -1),
|
||||
}
|
||||
|
||||
SPECIAL_HANDLING_TYPES = ["xmlheader", "doctype", "comment"]
|
||||
|
||||
SELF_CLOSING_TAGS = [
|
||||
"br",
|
||||
"hr",
|
||||
"input",
|
||||
"img",
|
||||
"image",
|
||||
"meta",
|
||||
"spacer",
|
||||
"link",
|
||||
"frame",
|
||||
"base",
|
||||
"col",
|
||||
"reference",
|
||||
]
|
||||
|
||||
|
||||
class MobiMLConverter(object):
|
||||
|
||||
PAGE_BREAK_PAT = re.compile(r"(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+", re.IGNORECASE)
|
||||
IMAGE_ATTRS = ("lowrecindex", "recindex", "hirecindex")
|
||||
|
||||
def __init__(self, filename):
|
||||
self.base_css_rules = "blockquote { margin: 0em 0em 0em 1.25em }\n"
|
||||
self.base_css_rules += "p { margin: 0em }\n"
|
||||
self.base_css_rules += ".bold { font-weight: bold }\n"
|
||||
self.base_css_rules += ".italic { font-style: italic }\n"
|
||||
self.base_css_rules += (
|
||||
".mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n"
|
||||
)
|
||||
self.tag_css_rules = {}
|
||||
self.tag_css_rule_cnt = 0
|
||||
self.path = []
|
||||
self.filename = filename
|
||||
self.wipml = open(self.filename, "rb").read()
|
||||
self.pos = 0
|
||||
self.opfname = self.filename.rsplit(".", 1)[0] + ".opf"
|
||||
self.opos = 0
|
||||
self.meta = ""
|
||||
self.cssname = os.path.join(os.path.dirname(self.filename), "styles.css")
|
||||
self.current_font_size = 3
|
||||
self.font_history = []
|
||||
|
||||
def cleanup_html(self):
|
||||
self.wipml = re.sub(
|
||||
r'<div height="0(pt|px|ex|em|%){0,1}"></div>', "", self.wipml
|
||||
)
|
||||
self.wipml = self.wipml.replace("\r\n", "\n")
|
||||
self.wipml = self.wipml.replace("> <", ">\n<")
|
||||
self.wipml = self.wipml.replace("<mbp: ", "<mbp:")
|
||||
# self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml)
|
||||
self.wipml = self.wipml.replace("<br></br>", "<br/>")
|
||||
|
||||
def replace_page_breaks(self):
|
||||
self.wipml = self.PAGE_BREAK_PAT.sub(
|
||||
'<div class="mbp_pagebreak" />', self.wipml
|
||||
)
|
||||
|
||||
# parse leading text of ml and tag
|
||||
def parseml(self):
|
||||
p = self.pos
|
||||
if p >= len(self.wipml):
|
||||
return None
|
||||
if self.wipml[p] != "<":
|
||||
res = self.wipml.find("<", p)
|
||||
if res == -1:
|
||||
res = len(self.wipml)
|
||||
self.pos = res
|
||||
return self.wipml[p:res], None
|
||||
# handle comment as a special case to deal with multi-line comments
|
||||
if self.wipml[p : p + 4] == "<!--":
|
||||
te = self.wipml.find("-->", p + 1)
|
||||
if te != -1:
|
||||
te = te + 2
|
||||
else:
|
||||
te = self.wipml.find(">", p + 1)
|
||||
ntb = self.wipml.find("<", p + 1)
|
||||
if ntb != -1 and ntb < te:
|
||||
self.pos = ntb
|
||||
return self.wipml[p:ntb], None
|
||||
self.pos = te + 1
|
||||
return None, self.wipml[p : te + 1]
|
||||
|
||||
# parses string version of tag to identify its name,
|
||||
# its type 'begin', 'end' or 'single',
|
||||
# plus build a hashtable of its attributes
|
||||
# code is written to handle the possibility of very poor formatting
|
||||
def parsetag(self, s):
|
||||
p = 1
|
||||
# get the tag name
|
||||
tname = None
|
||||
ttype = None
|
||||
tattr = {}
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] == "/":
|
||||
ttype = "end"
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " ", '"', "'", "\r", "\n"):
|
||||
p += 1
|
||||
tname = s[b:p].lower()
|
||||
if tname == "!doctype":
|
||||
tname = "!DOCTYPE"
|
||||
# special cases
|
||||
if tname in SPECIAL_HANDLING_TAGS.keys():
|
||||
ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
|
||||
tattr["special"] = s[p:backstep]
|
||||
if ttype is None:
|
||||
# parse any attributes
|
||||
while s.find("=", p) != -1:
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
b = p
|
||||
while s[p : p + 1] != "=":
|
||||
p += 1
|
||||
aname = s[b:p].lower()
|
||||
aname = aname.rstrip(" ")
|
||||
p += 1
|
||||
while s[p : p + 1] == " ":
|
||||
p += 1
|
||||
if s[p : p + 1] in ('"', "'"):
|
||||
p = p + 1
|
||||
b = p
|
||||
while s[p : p + 1] not in ('"', "'"):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
p += 1
|
||||
else:
|
||||
b = p
|
||||
while s[p : p + 1] not in (">", "/", " "):
|
||||
p += 1
|
||||
val = s[b:p]
|
||||
tattr[aname] = val
|
||||
# label beginning and single tags
|
||||
if ttype is None:
|
||||
ttype = "begin"
|
||||
if s.find(" /", p) >= 0:
|
||||
ttype = "single_ext"
|
||||
elif s.find("/", p) >= 0:
|
||||
ttype = "single"
|
||||
return ttype, tname, tattr
|
||||
|
||||
# main routine to convert from mobi markup language to html
|
||||
def processml(self):
|
||||
|
||||
# are these really needed
|
||||
html_done = False
|
||||
head_done = False
|
||||
body_done = False
|
||||
|
||||
skip = False
|
||||
|
||||
htmlstr = ""
|
||||
self.replace_page_breaks()
|
||||
self.cleanup_html()
|
||||
|
||||
# now parse the cleaned up ml into standard xhtml
|
||||
while True:
|
||||
|
||||
r = self.parseml()
|
||||
if not r:
|
||||
break
|
||||
|
||||
text, tag = r
|
||||
|
||||
if text:
|
||||
if not skip:
|
||||
htmlstr += text
|
||||
|
||||
if tag:
|
||||
ttype, tname, tattr = self.parsetag(tag)
|
||||
|
||||
# If we run into a DTD or xml declarations inside the body ... bail.
|
||||
if (
|
||||
tname in SPECIAL_HANDLING_TAGS.keys()
|
||||
and tname != "comment"
|
||||
and body_done
|
||||
):
|
||||
htmlstr += "\n</body></html>"
|
||||
break
|
||||
|
||||
# make sure self-closing tags actually self-close
|
||||
if ttype == "begin" and tname in SELF_CLOSING_TAGS:
|
||||
ttype = "single"
|
||||
|
||||
# make sure any end tags of self-closing tags are discarded
|
||||
if ttype == "end" and tname in SELF_CLOSING_TAGS:
|
||||
continue
|
||||
|
||||
# remove embedded guide and references from old mobis
|
||||
if tname in ("guide", "ncx", "reference") and ttype in (
|
||||
"begin",
|
||||
"single",
|
||||
"single_ext",
|
||||
):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
if (
|
||||
tname in ("guide", "ncx", "reference", "font", "span")
|
||||
and ttype == "end"
|
||||
):
|
||||
if self.path[-1] == "removeme:{0}".format(tname):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
|
||||
# Get rid of font tags that only have a color attribute.
|
||||
if tname == "font" and ttype in ("begin", "single", "single_ext"):
|
||||
if "color" in tattr.keys() and len(tattr.keys()) == 1:
|
||||
tname = "removeme:{0}".format(tname)
|
||||
tattr = None
|
||||
|
||||
# Get rid of empty spans in the markup.
|
||||
if (
|
||||
tname == "span"
|
||||
and ttype in ("begin", "single", "single_ext")
|
||||
and not len(tattr)
|
||||
):
|
||||
tname = "removeme:{0}".format(tname)
|
||||
|
||||
# need to handle fonts outside of the normal methods
|
||||
# so fonts tags won't be added to the self.path since we keep track
|
||||
# of font tags separately with self.font_history
|
||||
if tname == "font" and ttype == "begin":
|
||||
# check for nested font start tags
|
||||
if len(self.font_history) > 0:
|
||||
# inject a font end tag
|
||||
taginfo = ("end", "font", None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
self.font_history.append((ttype, tname, tattr))
|
||||
# handle the current font start tag
|
||||
taginfo = (ttype, tname, tattr)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
continue
|
||||
|
||||
# check for nested font tags and unnest them
|
||||
if tname == "font" and ttype == "end":
|
||||
self.font_history.pop()
|
||||
# handle this font end tag
|
||||
taginfo = ("end", "font", None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
# check if we were nested
|
||||
if len(self.font_history) > 0:
|
||||
# inject a copy of the most recent font start tag from history
|
||||
taginfo = self.font_history[-1]
|
||||
htmlstr += self.processtag(taginfo)
|
||||
continue
|
||||
|
||||
# keep track of nesting path
|
||||
if ttype == "begin":
|
||||
self.path.append(tname)
|
||||
elif ttype == "end":
|
||||
if tname != self.path[-1]:
|
||||
print ("improper nesting: ", self.path, tname, ttype)
|
||||
if tname not in self.path:
|
||||
# handle case of end tag with no beginning by injecting empty begin tag
|
||||
taginfo = ("begin", tname, None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
print " - fixed by injecting empty start tag ", tname
|
||||
self.path.append(tname)
|
||||
elif len(self.path) > 1 and tname == self.path[-2]:
|
||||
# handle case of dangling missing end
|
||||
taginfo = ("end", self.path[-1], None)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
print " - fixed by injecting end tag ", self.path[-1]
|
||||
self.path.pop()
|
||||
self.path.pop()
|
||||
|
||||
if tname == "removeme:{0}".format(tname):
|
||||
if ttype in ("begin", "single", "single_ext"):
|
||||
skip = True
|
||||
else:
|
||||
skip = False
|
||||
else:
|
||||
taginfo = (ttype, tname, tattr)
|
||||
htmlstr += self.processtag(taginfo)
|
||||
|
||||
# handle potential issue of multiple html, head, and body sections
|
||||
if tname == "html" and ttype == "begin" and not html_done:
|
||||
htmlstr += "\n"
|
||||
html_done = True
|
||||
|
||||
if tname == "head" and ttype == "begin" and not head_done:
|
||||
htmlstr += "\n"
|
||||
# also add in metadata and style link tags
|
||||
htmlstr += self.meta
|
||||
htmlstr += (
|
||||
'<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||
)
|
||||
head_done = True
|
||||
|
||||
if tname == "body" and ttype == "begin" and not body_done:
|
||||
htmlstr += "\n"
|
||||
body_done = True
|
||||
|
||||
# handle issue of possibly missing html, head, and body tags
|
||||
# I have not seen this but the original did something like this so ...
|
||||
if not body_done:
|
||||
htmlstr = "<body>\n" + htmlstr + "</body>\n"
|
||||
if not head_done:
|
||||
headstr = "<head>\n"
|
||||
headstr += self.meta
|
||||
headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
|
||||
headstr += "</head>\n"
|
||||
htmlstr = headstr + htmlstr
|
||||
if not html_done:
|
||||
htmlstr = "<html>\n" + htmlstr + "</html>\n"
|
||||
|
||||
# finally add DOCTYPE info
|
||||
htmlstr = (
|
||||
'<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
|
||||
+ htmlstr
|
||||
)
|
||||
|
||||
css = self.base_css_rules
|
||||
for cls, rule in self.tag_css_rules.items():
|
||||
css += ".%s { %s }\n" % (cls, rule)
|
||||
|
||||
return (htmlstr, css, self.cssname)
|
||||
|
||||
def ensure_unit(self, raw, unit="px"):
|
||||
if re.search(r"\d+$", raw) is not None:
|
||||
raw += unit
|
||||
return raw
|
||||
|
||||
# flatten possibly modified tag back to string
|
||||
def taginfo_tostring(self, taginfo):
|
||||
(ttype, tname, tattr) = taginfo
|
||||
if ttype is None or tname is None:
|
||||
return ""
|
||||
if ttype == "end":
|
||||
return "</%s>" % tname
|
||||
if (
|
||||
ttype in SPECIAL_HANDLING_TYPES
|
||||
and tattr is not None
|
||||
and "special" in tattr.keys()
|
||||
):
|
||||
info = tattr["special"]
|
||||
if ttype == "comment":
|
||||
return "<%s %s-->" % tname, info
|
||||
else:
|
||||
return "<%s %s>" % tname, info
|
||||
res = []
|
||||
res.append("<%s" % tname)
|
||||
if tattr is not None:
|
||||
for key in tattr.keys():
|
||||
res.append(' %s="%s"' % (key, tattr[key]))
|
||||
if ttype == "single":
|
||||
res.append("/>")
|
||||
elif ttype == "single_ext":
|
||||
res.append(" />")
|
||||
else:
|
||||
res.append(">")
|
||||
return "".join(res)
|
||||
|
||||
# routines to convert mobi ml tag attributes to xhtml attributes and styles
|
||||
def processtag(self, taginfo):
|
||||
# Converting mobi font sizes to numerics
|
||||
size_map = {
|
||||
"xx-small": "1",
|
||||
"x-small": "2",
|
||||
"small": "3",
|
||||
"medium": "4",
|
||||
"large": "5",
|
||||
"x-large": "6",
|
||||
"xx-large": "7",
|
||||
}
|
||||
|
||||
size_to_em_map = {
|
||||
"1": ".65em",
|
||||
"2": ".75em",
|
||||
"3": "1em",
|
||||
"4": "1.125em",
|
||||
"5": "1.25em",
|
||||
"6": "1.5em",
|
||||
"7": "2em",
|
||||
}
|
||||
|
||||
# current tag to work on
|
||||
(ttype, tname, tattr) = taginfo
|
||||
if not tattr:
|
||||
tattr = {}
|
||||
|
||||
styles = []
|
||||
|
||||
if tname is None or tname.startswith("removeme"):
|
||||
return ""
|
||||
|
||||
# have not seen an example of this yet so keep it here to be safe
|
||||
# until this is better understood
|
||||
if tname in (
|
||||
"country-region",
|
||||
"place",
|
||||
"placetype",
|
||||
"placename",
|
||||
"state",
|
||||
"city",
|
||||
"street",
|
||||
"address",
|
||||
"content",
|
||||
):
|
||||
tname = "div" if tname == "content" else "span"
|
||||
for key in list(tattr.keys()):
|
||||
tattr.pop(key)
|
||||
|
||||
# handle general case of style, height, width, bgcolor in any tag
|
||||
if "style" in tattr.keys():
|
||||
style = tattr.pop("style").strip()
|
||||
if style:
|
||||
styles.append(style)
|
||||
|
||||
if "align" in tattr.keys():
|
||||
align = tattr.pop("align").strip()
|
||||
if align:
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
else:
|
||||
styles.append("text-align: %s" % align)
|
||||
|
||||
if "height" in tattr.keys():
|
||||
height = tattr.pop("height").strip()
|
||||
if (
|
||||
height
|
||||
and "<" not in height
|
||||
and ">" not in height
|
||||
and re.search(r"\d+", height)
|
||||
):
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
elif tname == "img":
|
||||
tattr["height"] = height
|
||||
else:
|
||||
styles.append("margin-top: %s" % self.ensure_unit(height))
|
||||
|
||||
if "width" in tattr.keys():
|
||||
width = tattr.pop("width").strip()
|
||||
if width and re.search(r"\d+", width):
|
||||
if tname in ("table", "td", "tr"):
|
||||
pass
|
||||
elif tname == "img":
|
||||
tattr["width"] = width
|
||||
else:
|
||||
styles.append("text-indent: %s" % self.ensure_unit(width))
|
||||
if width.startswith("-"):
|
||||
styles.append("margin-left: %s" % self.ensure_unit(width[1:]))
|
||||
|
||||
if "bgcolor" in tattr.keys():
|
||||
# no proprietary html allowed
|
||||
if tname == "div":
|
||||
del tattr["bgcolor"]
|
||||
|
||||
elif tname == "font":
|
||||
# Change font tags to span tags
|
||||
tname = "span"
|
||||
if ttype in ("begin", "single", "single_ext"):
|
||||
# move the face attribute to css font-family
|
||||
if "face" in tattr.keys():
|
||||
face = tattr.pop("face").strip()
|
||||
styles.append('font-family: "%s"' % face)
|
||||
|
||||
# Monitor the constantly changing font sizes, change them to ems and move
|
||||
# them to css. The following will work for 'flat' font tags, but nested font tags
|
||||
# will cause things to go wonky. Need to revert to the parent font tag's size
|
||||
# when a closing tag is encountered.
|
||||
if "size" in tattr.keys():
|
||||
sz = tattr.pop("size").strip().lower()
|
||||
try:
|
||||
float(sz)
|
||||
except ValueError:
|
||||
if sz in size_map.keys():
|
||||
sz = size_map[sz]
|
||||
else:
|
||||
if sz.startswith("-") or sz.startswith("+"):
|
||||
sz = self.current_font_size + float(sz)
|
||||
if sz > 7:
|
||||
sz = 7
|
||||
elif sz < 1:
|
||||
sz = 1
|
||||
sz = str(int(sz))
|
||||
styles.append("font-size: %s" % size_to_em_map[sz])
|
||||
self.current_font_size = int(sz)
|
||||
|
||||
elif tname == "img":
|
||||
for attr in ("width", "height"):
|
||||
if attr in tattr:
|
||||
val = tattr[attr]
|
||||
if val.lower().endswith("em"):
|
||||
try:
|
||||
nval = float(val[:-2])
|
||||
nval *= 16 * (
|
||||
168.451 / 72
|
||||
) # Assume this was set using the Kindle profile
|
||||
tattr[attr] = "%dpx" % int(nval)
|
||||
except:
|
||||
del tattr[attr]
|
||||
elif val.lower().endswith("%"):
|
||||
del tattr[attr]
|
||||
|
||||
# convert the anchor tags
|
||||
if "filepos-id" in tattr:
|
||||
tattr["id"] = tattr.pop("filepos-id")
|
||||
if "name" in tattr and tattr["name"] != tattr["id"]:
|
||||
tattr["name"] = tattr["id"]
|
||||
|
||||
if "filepos" in tattr:
|
||||
filepos = tattr.pop("filepos")
|
||||
try:
|
||||
tattr["href"] = "#filepos%d" % int(filepos)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if styles:
|
||||
ncls = None
|
||||
rule = "; ".join(styles)
|
||||
for sel, srule in self.tag_css_rules.items():
|
||||
if srule == rule:
|
||||
ncls = sel
|
||||
break
|
||||
if ncls is None:
|
||||
self.tag_css_rule_cnt += 1
|
||||
ncls = "rule_%d" % self.tag_css_rule_cnt
|
||||
self.tag_css_rules[ncls] = rule
|
||||
cls = tattr.get("class", "")
|
||||
cls = cls + (" " if cls else "") + ncls
|
||||
tattr["class"] = cls
|
||||
|
||||
# convert updated tag back to string representation
|
||||
if len(tattr) == 0:
|
||||
tattr = None
|
||||
taginfo = (ttype, tname, tattr)
|
||||
return self.taginfo_tostring(taginfo)
|
||||
|
||||
|
||||
""" main only left in for testing outside of plugin """
|
||||
|
||||
|
||||
def main(argv=sys.argv):
|
||||
if len(argv) != 2:
|
||||
return 1
|
||||
else:
|
||||
infile = argv[1]
|
||||
|
||||
try:
|
||||
print "Converting Mobi Markup Language to XHTML"
|
||||
mlc = MobiMLConverter(infile)
|
||||
print "Processing ..."
|
||||
htmlstr, css, cssname = mlc.processml()
|
||||
outname = infile.rsplit(".", 1)[0] + "_converted.html"
|
||||
open(outname, "wb").write(htmlstr)
|
||||
open(cssname, "wb").write(css)
|
||||
print "Completed"
|
||||
print "XHTML version of book can be found at: " + outname
|
||||
|
||||
except ValueError as e:
|
||||
print "Error: %s" % e
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
103
mobiparse/mobi/unipath.py
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
# Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice, this list
|
||||
# of conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
||||
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
from .compatibility_utils import PY2, text_type, binary_type
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# utility routines to convert all paths to be full unicode
|
||||
|
||||
# Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
|
||||
# Under Python 3, if bytes, try to convert it to unicode using os.fsencode() to decode it
|
||||
|
||||
# Mac OS X and Windows will happily support full unicode paths
|
||||
# Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
|
||||
|
||||
fsencoding = sys.getfilesystemencoding()
|
||||
|
||||
|
||||
def pathof(s, enc=fsencoding):
|
||||
if s is None:
|
||||
return None
|
||||
if isinstance(s, text_type):
|
||||
return s
|
||||
if isinstance(s, binary_type):
|
||||
try:
|
||||
return s.decode(enc)
|
||||
except:
|
||||
pass
|
||||
return s
|
||||
|
||||
|
||||
def exists(s):
|
||||
return os.path.exists(pathof(s))
|
||||
|
||||
|
||||
def isfile(s):
|
||||
return os.path.isfile(pathof(s))
|
||||
|
||||
|
||||
def isdir(s):
|
||||
return os.path.isdir(pathof(s))
|
||||
|
||||
|
||||
def mkdir(s):
|
||||
return os.mkdir(pathof(s))
|
||||
|
||||
|
||||
def listdir(s):
|
||||
rv = []
|
||||
for file in os.listdir(pathof(s)):
|
||||
rv.append(pathof(file))
|
||||
return rv
|
||||
|
||||
|
||||
def getcwd():
|
||||
if PY2:
|
||||
return os.getcwdu()
|
||||
return os.getcwd()
|
||||
|
||||
|
||||
def walk(top):
|
||||
top = pathof(top)
|
||||
rv = []
|
||||
for base, dnames, names in os.walk(top):
|
||||
base = pathof(base)
|
||||
for name in names:
|
||||
name = pathof(name)
|
||||
rv.append(relpath(os.path.join(base, name), top))
|
||||
return rv
|
||||
|
||||
|
||||
def relpath(path, start=None):
|
||||
return os.path.relpath(pathof(path), pathof(start))
|
||||
|
||||
|
||||
def abspath(path):
|
||||
return os.path.abspath(pathof(path))
|
||||
175
mobiparse/mobi/unpack_structure.py
Executable file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import text_type
|
||||
|
||||
from . import unipath
|
||||
from .unipath import pathof
|
||||
|
||||
DUMP = False
|
||||
""" Set to True to dump all possible information. """
|
||||
|
||||
import os
|
||||
|
||||
import re
|
||||
|
||||
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
import zipfile
|
||||
import binascii
|
||||
from .mobi_utils import mangle_fonts
|
||||
|
||||
|
||||
class unpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ZipInfo(zipfile.ZipInfo):
|
||||
def __init__(self, *args, **kwargs):
|
||||
if "compress_type" in kwargs:
|
||||
compress_type = kwargs.pop("compress_type")
|
||||
super(ZipInfo, self).__init__(*args, **kwargs)
|
||||
self.compress_type = compress_type
|
||||
|
||||
|
||||
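# builds the on-disk output layout (mobi7/Images, HDImages, mobi8/OEBPS/...) and packs the mobi8 part into an epub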
class fileNames:
|
||||
def __init__(self, infile, outdir):
|
||||
self.infile = infile
|
||||
self.outdir = outdir
|
||||
if not unipath.exists(self.outdir):
|
||||
unipath.mkdir(self.outdir)
|
||||
self.mobi7dir = os.path.join(self.outdir, "mobi7")
|
||||
if not unipath.exists(self.mobi7dir):
|
||||
unipath.mkdir(self.mobi7dir)
|
||||
self.imgdir = os.path.join(self.mobi7dir, "Images")
|
||||
if not unipath.exists(self.imgdir):
|
||||
unipath.mkdir(self.imgdir)
|
||||
self.hdimgdir = os.path.join(self.outdir, "HDImages")
|
||||
if not unipath.exists(self.hdimgdir):
|
||||
unipath.mkdir(self.hdimgdir)
|
||||
self.outbase = os.path.join(
|
||||
self.outdir, os.path.splitext(os.path.split(infile)[1])[0]
|
||||
)
|
||||
|
||||
def getInputFileBasename(self):
|
||||
return os.path.splitext(os.path.basename(self.infile))[0]
|
||||
|
||||
def makeK8Struct(self):
|
||||
self.k8dir = os.path.join(self.outdir, "mobi8")
|
||||
if not unipath.exists(self.k8dir):
|
||||
unipath.mkdir(self.k8dir)
|
||||
self.k8metainf = os.path.join(self.k8dir, "META-INF")
|
||||
if not unipath.exists(self.k8metainf):
|
||||
unipath.mkdir(self.k8metainf)
|
||||
self.k8oebps = os.path.join(self.k8dir, "OEBPS")
|
||||
if not unipath.exists(self.k8oebps):
|
||||
unipath.mkdir(self.k8oebps)
|
||||
self.k8images = os.path.join(self.k8oebps, "Images")
|
||||
if not unipath.exists(self.k8images):
|
||||
unipath.mkdir(self.k8images)
|
||||
self.k8fonts = os.path.join(self.k8oebps, "Fonts")
|
||||
if not unipath.exists(self.k8fonts):
|
||||
unipath.mkdir(self.k8fonts)
|
||||
self.k8styles = os.path.join(self.k8oebps, "Styles")
|
||||
if not unipath.exists(self.k8styles):
|
||||
unipath.mkdir(self.k8styles)
|
||||
self.k8text = os.path.join(self.k8oebps, "Text")
|
||||
if not unipath.exists(self.k8text):
|
||||
unipath.mkdir(self.k8text)
|
||||
|
||||
# recursive zip creation support routine
|
||||
def zipUpDir(self, myzip, tdir, localname):
|
||||
currentdir = tdir
|
||||
if localname != "":
|
||||
currentdir = os.path.join(currentdir, localname)
|
||||
list = unipath.listdir(currentdir)
|
||||
for file in list:
|
||||
afilename = file
|
||||
localfilePath = os.path.join(localname, afilename)
|
||||
realfilePath = os.path.join(currentdir, file)
|
||||
if unipath.isfile(realfilePath):
|
||||
myzip.write(
|
||||
pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED
|
||||
)
|
||||
elif unipath.isdir(realfilePath):
|
||||
self.zipUpDir(myzip, tdir, localfilePath)
|
||||
|
||||
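# assemble the epub: obfuscate used fonts when requested, write container.xml / encryption.xml,
# then zip the mimetype (stored, uncompressed) plus META-INF and OEBPS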
def makeEPUB(self, usedmap, obfuscate_data, uid):
|
||||
bname = os.path.join(self.k8dir, self.getInputFileBasename() + ".epub")
|
||||
# Create an encryption key for Adobe font obfuscation
|
||||
# based on the epub's uid
|
||||
if isinstance(uid, text_type):
|
||||
uid = uid.encode("ascii")
|
||||
if obfuscate_data:
|
||||
key = re.sub(br"[^a-fA-F0-9]", b"", uid)
|
||||
key = binascii.unhexlify((key + key)[:32])
|
||||
|
||||
# copy over all images and fonts that are actually used in the ebook
|
||||
# and remove all font files from mobi7 since not supported
|
||||
imgnames = unipath.listdir(self.imgdir)
|
||||
for name in imgnames:
|
||||
if usedmap.get(name, "not used") == "used":
|
||||
filein = os.path.join(self.imgdir, name)
|
||||
if name.endswith(".ttf"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
elif name.endswith(".otf"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
elif name.endswith(".failed"):
|
||||
fileout = os.path.join(self.k8fonts, name)
|
||||
else:
|
||||
fileout = os.path.join(self.k8images, name)
|
||||
data = b""
|
||||
with open(pathof(filein), "rb") as f:
|
||||
data = f.read()
|
||||
if obfuscate_data:
|
||||
if name in obfuscate_data:
|
||||
data = mangle_fonts(key, data)
|
||||
open(pathof(fileout), "wb").write(data)
|
||||
if name.endswith(".ttf") or name.endswith(".otf"):
|
||||
os.remove(pathof(filein))
|
||||
|
||||
# opf file name hard coded to "content.opf"
|
||||
container = '<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||
container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
|
||||
container += " <rootfiles>\n"
|
||||
container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
|
||||
container += " </rootfiles>\n</container>\n"
|
||||
fileout = os.path.join(self.k8metainf, "container.xml")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(container.encode("utf-8"))
|
||||
|
||||
if obfuscate_data:
|
||||
encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
|
||||
xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
|
||||
for font in obfuscate_data:
|
||||
encryption += " <enc:EncryptedData>\n"
|
||||
encryption += ' <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
|
||||
encryption += " <enc:CipherData>\n"
|
||||
encryption += (
|
||||
' <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
|
||||
)
|
||||
encryption += " </enc:CipherData>\n"
|
||||
encryption += " </enc:EncryptedData>\n"
|
||||
encryption += "</encryption>\n"
|
||||
fileout = os.path.join(self.k8metainf, "encryption.xml")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(encryption.encode("utf-8"))
|
||||
|
||||
# ready to build epub
|
||||
self.outzip = zipfile.ZipFile(pathof(bname), "w")
|
||||
|
||||
# add the mimetype file uncompressed
|
||||
mimetype = b"application/epub+zip"
|
||||
fileout = os.path.join(self.k8dir, "mimetype")
|
||||
with open(pathof(fileout), "wb") as f:
|
||||
f.write(mimetype)
|
||||
nzinfo = ZipInfo("mimetype", compress_type=zipfile.ZIP_STORED)
|
||||
nzinfo.external_attr = 0o600 << 16 # make this a normal file
|
||||
self.outzip.writestr(nzinfo, mimetype)
|
||||
self.zipUpDir(self.outzip, self.k8dir, "META-INF")
|
||||
self.zipUpDir(self.outzip, self.k8dir, "OEBPS")
|
||||
self.outzip.close()
|
||||
21
mobiparse/mobi/x
Normal file
@@ -0,0 +1,21 @@
|
||||
# KF8 (Mobi 8)
|
||||
if mh.isK8():
|
||||
processMobi8(
|
||||
mh,
|
||||
metadata,
|
||||
sect,
|
||||
files,
|
||||
rscnames,
|
||||
pagemapproc,
|
||||
k8resc,
|
||||
obfuscate_data,
|
||||
apnxfile,
|
||||
epubver,
|
||||
)
|
||||
|
||||
# Old Mobi (Mobi 7)
|
||||
elif not k8only:
|
||||
processMobi7(mh, metadata, sect, files, rscnames)
|
||||
|
||||
# CGDBG
|
||||
print('k8only {} mh.isK8() {}'.format(k8only, mh.isK8()))
|
||||