kindle manager
This commit is contained in:
327
mobimaster/mobi/mobi_index.py
Executable file
327
mobimaster/mobi/mobi_index.py
Executable file
@@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bchr, bstr, bord
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
from .mobi_utils import toHex
|
||||
|
||||
|
||||
class MobiIndex:
    """Reader for the INDX (index) sections of a sectioned MOBI container.

    ``sect`` is the section accessor supplied by the caller; this class only
    calls ``sect.loadSection(n)`` and ``sect.setsectiondescription(n, text)``
    on it.
    """

    # CGDBG: DEBUG defaults to True here (presumably a local debugging
    # change -- confirm before release).
    def __init__(self, sect, DEBUG=True):
        self.sect = sect
        self.DEBUG = DEBUG

    def getIndexData(self, idx, label="Unknown"):
        """
        Decode the index that starts at section ``idx``.

        @param idx: Number of the main INDX section; 0xFFFFFFFF means
            "no index" and yields empty results.
        @param label: Text used in the section descriptions that are
            registered on ``sect`` while reading.
        @return: Tuple ``(outtbl, ctoc_text)``. ``outtbl`` is a list of
            ``[text, tagMap]`` pairs, one per index entry; ``ctoc_text``
            maps CTOC offsets to the raw name bytes found there (offsets
            of the n-th CTOC section are shifted by ``n * 0x10000``).
        """
        sect = self.sect
        outtbl = []
        ctoc_text = {}
        if idx == 0xFFFFFFFF:
            return outtbl, ctoc_text

        sect.setsectiondescription(idx, "{0} Main INDX section".format(label))
        data = sect.loadSection(idx)
        parsed = self.parseINDXHeader(data)
        if parsed is False:
            # parseINDXHeader has already logged the warning; unpacking the
            # False it returns would raise a TypeError, so hand back the
            # (empty) results instead.
            return outtbl, ctoc_text
        idxhdr, _, hordt2 = parsed
        IndexCount = idxhdr["count"]

        # handle the case of multiple sections used for CTOC
        rec_off = 0
        off = idx + IndexCount + 1
        for j in range(idxhdr["nctoc"]):
            cdata = sect.loadSection(off + j)
            sect.setsectiondescription(off + j, label + " CTOC Data " + str(j))
            ctocdict = self.readCTOC(cdata)
            for k in ctocdict:
                ctoc_text[k + rec_off] = ctocdict[k]
            rec_off += 0x10000

        tagSectionStart = idxhdr["len"]
        controlByteCount, tagTable = readTagSection(tagSectionStart, data)
        if self.DEBUG:
            # loguru formats with str.format(), so the values need "{}"
            # placeholders; bare extra arguments are silently dropped.
            logger.debug("ControlByteCount is {}", controlByteCount)
            logger.debug("IndexCount is {}", IndexCount)
            logger.debug("TagTable: {}", tagTable)

        for i in range(idx + 1, idx + 1 + IndexCount):
            sect.setsectiondescription(
                i, "{0} Extra {1:d} INDX section".format(label, i - idx)
            )
            data = sect.loadSection(i)
            parsed = self.parseINDXHeader(data)
            if parsed is False:
                # Not an INDX section (warning already logged): skip it
                # rather than crash while unpacking the result.
                continue
            hdrinfo = parsed[0]
            idxtPos = hdrinfo["start"]
            entryCount = hdrinfo["count"]
            if self.DEBUG:
                logger.debug("{} {}", idxtPos, entryCount)

            # loop through to build up the IDXT position starts
            # (2-byte big-endian offsets located 4 bytes after idxtPos)
            idxPositions = []
            for j in range(entryCount):
                (pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
                idxPositions.append(pos)
            # The last entry ends before the IDXT tag (but there might be
            # zero fill bytes we need to ignore!)
            idxPositions.append(idxtPos)

            # for each entry in the IDXT build up the tagMap and any
            # associated text
            for j in range(entryCount):
                startPos = idxPositions[j]
                endPos = idxPositions[j + 1]
                # each entry starts with a one byte text length
                textLength = ord(data[startPos : startPos + 1])
                text = data[startPos + 1 : startPos + 1 + textLength]
                if hordt2 is not None:
                    # translate the text through the main header's ORDT2 table
                    text = b"".join(bchr(hordt2[bord(x)]) for x in text)
                tagMap = getTagMap(
                    controlByteCount,
                    tagTable,
                    data,
                    startPos + 1 + textLength,
                    endPos,
                )
                outtbl.append([text, tagMap])
                if self.DEBUG:
                    # CGDBG
                    logger.debug("tagMap {}", tagMap)
                    logger.debug("text {}", text)
                    logger.debug("data {}", data)

        return outtbl, ctoc_text

    def parseINDXHeader(self, data):
        """
        Read an INDX header.

        @param data: Raw bytes of the section.
        @return: ``False`` if ``data`` does not start with ``INDX``;
            otherwise a tuple ``(header, ordt1, ordt2)`` where ``header``
            maps the field names listed in ``words`` to 32-bit big-endian
            values and ``ordt1`` / ``ordt2`` are tuples of ORDT table
            entries, or ``None`` when the section carries no ORDT tables.
        """
        if not data[:4] == b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        # the header fields follow the 4 byte "INDX" magic, one long each
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = dict(zip(words, values))

        ordt1 = None
        ordt2 = None

        # the last of the five longs is read but never used here
        ocnt, oentries, op1, op2, _ = struct.unpack_from(b">LLLLL", data, 0xA4)
        if header["code"] == 0xFDEA or ocnt != 0 or oentries > 0:
            # horribly hacked up ESP (sample) mobi books use two ORDT sections
            # but never specify them in the proper place in the header. They
            # seem to be codepage 65002 which seems to be some sort of strange
            # EBCDIC utf-8 or 16 encoded strings

            # so we need to look for them and store them away to process
            # leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            assert ocnt == 1
            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if self.DEBUG:
            logger.debug("parsed INDX header:")
            for n in words:
                # goes through the logger like the rest of the debug output
                # instead of a stray print() to stdout
                logger.debug("{} {:X}", n, header[n])
            logger.debug("")
        return header, ordt1, ordt2

    def readCTOC(self, txtdata):
        """
        Read all blocks from a CTOC section.

        @param txtdata: Raw bytes of the CTOC section.
        @return: Dict mapping the offset at which each record starts to the
            raw bytes of its name. Reading stops at the end of the data or
            at the first record that starts with a zero byte.
        """
        ctoc_data = {}
        offset = 0
        while offset < len(txtdata):
            # a one byte slice compares the same way on Python 2 and 3
            if txtdata[offset : offset + 1] == b"\0":
                break
            idx_offs = offset
            # first n bytes: name len as vwi
            pos, ilen = getVariableWidthValue(txtdata, offset)
            offset += pos
            # <len> next bytes: name
            name = txtdata[offset : offset + ilen]
            offset += ilen
            if self.DEBUG:
                logger.debug("name length is {}", ilen)
                # "{}" placeholders: loguru does not understand "%s"
                logger.debug("{} {}", idx_offs, name)
            ctoc_data[idx_offs] = name
        return ctoc_data
|
||||
|
||||
|
||||
def getVariableWidthValue(data, offset):
    """
    Decode one variable width value from the given bytes.

    Each byte contributes its low 7 bits, most significant group first;
    the byte that has its high bit (0x80) set is the last one of the value.

    @param data: The bytes to decode.
    @param offset: The start offset into data.
    @return: Tuple of consumed bytes count and decoded value.
    """
    result = 0
    cursor = offset
    while True:
        # slice (not index) so the read yields a length-1 bytestring on
        # both Python 2 and Python 3
        octet = ord(data[cursor : cursor + 1])
        cursor += 1
        result = (result << 7) | (octet & 0x7F)
        if octet & 0x80:
            break
    return cursor - offset, result
|
||||
|
||||
|
||||
def readTagSection(start, data):
    """
    Read tag section from given data.

    If ``data`` does not carry the ``TAGX`` magic at ``start`` the result is
    ``(0, [])``.

    @param start: The start position in the data.
    @param data: The data to process.
    @return: Tuple of control byte count and list of tag tuples. Each tag
        tuple holds the four byte values of one 4-byte table entry.
    """
    controlByteCount = 0
    tags = []
    if data[start : start + 4] == b"TAGX":
        # two big-endian longs follow the magic: the offset of the first
        # entry (relative to start) and the control byte count
        firstEntryOffset, controlByteCount = struct.unpack_from(
            b">LL", data, start + 0x04
        )
        # Skip the first 12 bytes already read above; every tag is 4 bytes.
        for pos in range(start + 12, start + firstEntryOffset, 4):
            tags.append(struct.unpack_from(b">BBBB", data, pos))
    return controlByteCount, tags
|
||||
|
||||
|
||||
def countSetBits(value, bits=8):
    """
    Count the set bits in the given value.

    Only the lowest ``bits`` bits of ``value`` are inspected.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits.
    """
    if bits <= 0:
        # nothing to inspect (also avoids a negative shift count below)
        return 0
    # mask to the requested width, then count the "1" digits of the
    # binary representation
    return bin(value & ((1 << bits) - 1)).count("1")
|
||||
|
||||
|
||||
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    """
    Create a map of tags and values from the given byte section.

    The control bytes at ``startPos`` decide, per tag table entry, how many
    values (or how many bytes of values) follow; the values themselves are
    read as variable width values starting right after the control bytes.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table; an iterable of
        (tag, valuesPerEntry, mask, endFlag) tuples.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    """
    # (tag, valueCount, valueBytes, valuesPerEntry); exactly one of
    # valueCount / valueBytes is not None
    pending = []
    tagHashMap = {}
    controlByteIndex = 0
    dataStart = startPos + controlByteCount

    # First pass: interpret the control bytes.
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # an end-flag entry moves on to the next control byte
            controlByteIndex += 1
            continue
        controlPos = startPos + controlByteIndex
        value = ord(entryData[controlPos : controlPos + 1]) & mask
        if value == 0:
            # tag not present in this entry
            continue
        if value != mask:
            # Shift bits to get the masked value.
            while mask & 0x01 == 0:
                mask = mask >> 1
                value = value >> 1
            pending.append((tag, value, None, valuesPerEntry))
        elif countSetBits(mask) > 1:
            # If all bits of masked value are set and the mask has more than
            # one bit, a variable width value will follow after the control
            # bytes which defines the length of bytes (NOT the value count!)
            # which will contain the corresponding variable width values.
            consumed, value = getVariableWidthValue(entryData, dataStart)
            dataStart += consumed
            pending.append((tag, None, value, valuesPerEntry))
        else:
            pending.append((tag, 1, None, valuesPerEntry))

    # Second pass: read the values announced by the control bytes.
    for tag, valueCount, valueBytes, valuesPerEntry in pending:
        values = []
        if valueCount is not None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount * valuesPerEntry):
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                logger.debug(
                    "Error: Should consume %s bytes, but consumed %s"
                    % (valueBytes, totalConsumed)
                )
        tagHashMap[tag] = values

    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain
        # only if non zero bytes are left.
        leftover = entryData[dataStart:endPos]
        if any(bord(char) != 0 for char in leftover):
            logger.debug(
                "Warning: There are unprocessed index bytes left: %s"
                % toHex(leftover)
            )

    return tagHashMap
|
||||
Reference in New Issue
Block a user