kindle manager
mobiparse/mobi/mobi_dict.py | 473 | Executable file
@@ -0,0 +1,473 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr

from loguru import logger

if PY2:
    range = xrange
    array_format = b"B"
if PY3:
    unichr = chr
    array_format = "B"

import array
import struct

# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

DEBUG_DICT = True


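# InflectionData wraps the raw inflection (INFL) index sections of a dictionary
# MOBI. Each section stores the start offset of its entry-offset table at 0x14
# and its entry count at 0x18; lookup() maps a global entry number to the
# section that holds it.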
class InflectionData(object):
    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for idata in self.infldatas:
            (start,) = struct.unpack_from(b">L", idata, 0x14)
            (count,) = struct.unpack_from(b">L", idata, 0x18)
            self.starts.append(start)
            self.counts.append(count)

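    # Map a global inflection entry number onto the section that contains it,
    # returning (local entry number, table start, entry count, section data).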
    def lookup(self, lookupvalue):
        i = 0
        rvalue = lookupvalue
        while rvalue >= self.counts[i]:
            rvalue = rvalue - self.counts[i]
            i += 1
            if i == len(self.counts):
                logger.debug("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return rvalue, self.starts[i], self.counts[i], self.infldatas[i]

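    # Return the offset of entry `value` within its section, the offset of the
    # following entry (None for the last entry), and the section data itself.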
    def offsets(self, value):
        rvalue, start, count, data = self.lookup(value)
        (offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
        if rvalue + 1 < count:
            (nextOffset,) = struct.unpack_from(
                b">H", data, start + 4 + (2 * (rvalue + 1))
            )
        else:
            nextOffset = None
        return offset, nextOffset, data


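# dictSupport reads the orthographic and inflection indices of a dictionary
# MOBI and produces a map of text positions to <idx:...> markup that a caller
# can splice back into the extracted book text. Typical use (mh is the parsed
# MOBI header object, sect the section loader supplied elsewhere in this
# package):
#     positionMap = dictSupport(mh, sect).getPositionMap()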
class dictSupport(object):
    def __init__(self, mh, sect):
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

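    # Parse an INDX record header: thirteen big-endian longwords follow the
    # magic, with ORDT lookup-table info at offset 0xA4. Returns the header
    # dict plus the ORDT1/ORDT2 tables (None when not present).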
    def parseHeader(self, data):
        "read INDX header"
        if data[:4] != b"INDX":
            logger.debug("Warning: index section is not INDX")
            return False
        words = (
            "len",
            "nul1",
            "type",
            "gen",
            "start",
            "count",
            "code",
            "lng",
            "total",
            "ordt",
            "ligt",
            "nligt",
            "nctoc",
        )
        num = len(words)
        values = struct.unpack(bstr(">%dL" % num), data[4 : 4 * (num + 1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b">LLLLL", data, 0xA4)
        header["otype"] = otype
        header["oentries"] = oentries

        if DEBUG_DICT:
            logger.debug(
                "otype %d, oentries %d, op1 %d, op2 %d, otagx %d"
                % (otype, oentries, op1, op2, otagx)
            )

        if header["code"] == 0xFDEA or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert data[op1 : op1 + 4] == b"ORDT"
            assert data[op2 : op2 + 4] == b"ORDT"
            ordt1 = struct.unpack_from(bstr(">%dB" % oentries), data, op1 + 4)
            ordt2 = struct.unpack_from(bstr(">%dH" % oentries), data, op2 + 4)

        if DEBUG_DICT:
            logger.debug("parsed INDX header:")
            for key in header:
                logger.debug("%s %x" % (key, header[key]))
            logger.debug("\n")
        return header, ordt1, ordt2

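    # Walk the orthographic index (and, when present, the inflection index) and
    # build a dict mapping byte positions in the book text to the <idx:entry>
    # markup that belongs at each position.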
    def getPositionMap(self):
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            logger.debug(
                "Info: Document contains orthographic index, handle as dictionary"
            )
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                logger.debug("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr["count"]
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr["len"]
                inflectionControlByteCount, inflectionTagTable = readTagSection(
                    tagSectionStart, metaInflIndexData
                )
                if DEBUG_DICT:
                    logger.debug("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    logger.debug(
                        "Error: Dictionary uses obsolete inflection rule scheme which is not yet supported"
                    )
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            logger.debug("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr["len"]
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr["count"]
            logger.debug("orthIndexCount is %d" % orthIndexCount)
            if DEBUG_DICT:
                logger.debug("orthTagTable: %s" % tagTable)
                if hordt2 is not None:
                    logger.debug(
                        "orth entry uses ordt2 lookup table of type %d"
                        % idxhdr["otype"]
                    )
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                logger.debug("Info: Index doesn't contain entry length tags")

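            # Each index record lists its entries in an IDXT table: read the
            # entry start offsets, then decode each entry's text and tag map.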
logger.debug("Read dictionary index data")
|
||||
for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
|
||||
data = sect.loadSection(i)
|
||||
hdrinfo, ordt1, ordt2 = self.parseHeader(data)
|
||||
idxtPos = hdrinfo["start"]
|
||||
entryCount = hdrinfo["count"]
|
||||
idxPositions = []
|
||||
for j in range(entryCount):
|
||||
(pos,) = struct.unpack_from(b">H", data, idxtPos + 4 + (2 * j))
|
||||
idxPositions.append(pos)
|
||||
# The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
|
||||
idxPositions.append(idxtPos)
|
||||
for j in range(entryCount):
|
||||
startPos = idxPositions[j]
|
||||
endPos = idxPositions[j + 1]
|
||||
textLength = ord(data[startPos : startPos + 1])
|
||||
text = data[startPos + 1 : startPos + 1 + textLength]
|
||||
if hordt2 is not None:
|
||||
utext = ""
|
||||
if idxhdr["otype"] == 0:
|
||||
pattern = b">H"
|
||||
inc = 2
|
||||
else:
|
||||
pattern = b">B"
|
||||
inc = 1
|
||||
pos = 0
|
||||
while pos < textLength:
|
||||
(off,) = struct.unpack_from(pattern, text, pos)
|
||||
if off < len(hordt2):
|
||||
utext += unichr(hordt2[off])
|
||||
else:
|
||||
utext += unichr(off)
|
||||
pos += inc
|
||||
text = utext.encode("utf-8")
|
||||
|
||||
tagMap = getTagMap(
|
||||
controlByteCount,
|
||||
tagTable,
|
||||
data,
|
||||
startPos + 1 + textLength,
|
||||
endPos,
|
||||
)
|
||||
if 0x01 in tagMap:
|
||||
if decodeInflection and 0x2A in tagMap:
|
||||
inflectionGroups = self.getInflectionGroups(
|
||||
text,
|
||||
inflectionControlByteCount,
|
||||
inflectionTagTable,
|
||||
dinfl,
|
||||
inflNameData,
|
||||
tagMap[0x2A],
|
||||
)
|
||||
else:
|
||||
inflectionGroups = b""
|
||||
assert len(tagMap[0x01]) == 1
|
||||
entryStartPosition = tagMap[0x01][0]
|
||||
if hasEntryLength:
|
||||
# The idx:entry attribute "scriptable" must be present to create entry length tags.
|
||||
ml = (
|
||||
b'<idx:entry scriptable="yes"><idx:orth value="'
|
||||
+ text
|
||||
+ b'">'
|
||||
+ inflectionGroups
|
||||
+ b"</idx:orth>"
|
||||
)
|
||||
if entryStartPosition in positionMap:
|
||||
positionMap[entryStartPosition] = (
|
||||
positionMap[entryStartPosition] + ml
|
||||
)
|
||||
else:
|
||||
positionMap[entryStartPosition] = ml
|
||||
assert len(tagMap[0x02]) == 1
|
||||
entryEndPosition = entryStartPosition + tagMap[0x02][0]
|
||||
if entryEndPosition in positionMap:
|
||||
positionMap[entryEndPosition] = (
|
||||
b"</idx:entry>" + positionMap[entryEndPosition]
|
||||
)
|
||||
else:
|
||||
positionMap[entryEndPosition] = b"</idx:entry>"
|
||||
|
||||
else:
|
||||
indexTags = (
|
||||
b'<idx:entry>\n<idx:orth value="'
|
||||
+ text
|
||||
+ b'">\n'
|
||||
+ inflectionGroups
|
||||
+ b"</idx:entry>\n"
|
||||
)
|
||||
if entryStartPosition in positionMap:
|
||||
positionMap[entryStartPosition] = (
|
||||
positionMap[entryStartPosition] + indexTags
|
||||
)
|
||||
else:
|
||||
positionMap[entryStartPosition] = indexTags
|
||||
return positionMap
|
||||
|
||||
def hasTag(self, tagTable, tag):
|
||||
"""
|
||||
Test if tag table contains given tag.
|
||||
|
||||
@param tagTable: The tag table.
|
||||
@param tag: The tag to search.
|
||||
@return: True if tag table contains given tag; False otherwise.
|
||||
"""
|
||||
for currentTag, _, _, _ in tagTable:
|
||||
if currentTag == tag:
|
||||
return True
|
||||
return False
|
||||
|
||||
def getInflectionGroups(
|
||||
self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList
|
||||
):
|
||||
"""
|
||||
Create string which contains the inflection groups with inflection rules as mobipocket tags.
|
||||
|
||||
@param mainEntry: The word to inflect.
|
||||
@param controlByteCount: The number of control bytes.
|
||||
@param tagTable: The tag table.
|
||||
@param data: The Inflection data object to properly select the right inflection data section to use
|
||||
@param inflectionNames: The inflection rule name data.
|
||||
@param groupList: The list of inflection groups to process.
|
||||
@return: String with inflection groups and rules or empty string if required tags are not available.
|
||||
"""
|
||||
result = b""
|
||||
for value in groupList:
|
||||
offset, nextOffset, data = dinfl.offsets(value)
|
||||
|
||||
# First byte seems to be always 0x00 and must be skipped.
|
||||
assert ord(data[offset : offset + 1]) == 0x00
|
||||
tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
|
||||
|
||||
# Make sure that the required tags are available.
|
||||
if 0x05 not in tagMap:
|
||||
logger.debug("Error: Required tag 0x05 not found in tagMap")
|
||||
return ""
|
||||
if 0x1A not in tagMap:
|
||||
logger.debug("Error: Required tag 0x1a not found in tagMap")
|
||||
return b""
|
||||
|
||||
result += b"<idx:infl>"
|
||||
|
||||
for i in range(len(tagMap[0x05])):
|
||||
|
||||
# Get name of inflection rule.
|
||||
value = tagMap[0x05][i]
|
||||
consumed, textLength = getVariableWidthValue(inflectionNames, value)
|
||||
inflectionName = inflectionNames[
|
||||
value + consumed : value + consumed + textLength
|
||||
]
|
||||
|
||||
# Get and apply inflection rule across possibly multiple inflection data sections
|
||||
value = tagMap[0x1A][i]
|
||||
rvalue, start, count, data = dinfl.lookup(value)
|
||||
(offset,) = struct.unpack_from(b">H", data, start + 4 + (2 * rvalue))
|
||||
textLength = ord(data[offset : offset + 1])
|
||||
inflection = self.applyInflectionRule(
|
||||
mainEntry, data, offset + 1, offset + 1 + textLength
|
||||
)
|
||||
if inflection is not None:
|
||||
result += (
|
||||
b' <idx:iform name="'
|
||||
+ inflectionName
|
||||
+ b'" value="'
|
||||
+ inflection
|
||||
+ b'"/>'
|
||||
)
|
||||
|
||||
result += b"</idx:infl>"
|
||||
return result
|
||||
|
||||
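    # Inflection rules are compact byte programs: 0x01-0x04 switch the edit
    # mode (insert/delete at word start/end), 0x0A-0x13 move the cursor
    # backwards by (byte - 0x0A), and any byte above 0x13 is a literal
    # character to insert, or the character expected to be deleted, in the
    # current mode.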
    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        """
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        """
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset : charOffset + 1]
            abyte = ord(char)
            if abyte >= 0x0A and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0A
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                elif position == -1:
                    logger.debug(
                        "Error: Unexpected first byte %i of inflection rule" % abyte
                    )
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x03: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                logger.debug(
                                    "0x04: %s %s %s %s"
                                    % (
                                        mainEntry,
                                        toHex(inflectionRuleData[start:end]),
                                        char,
                                        bchr(deleted),
                                    )
                                )
                            logger.debug(
                                "Error: Delete operation of inflection rule failed"
                            )
                            return None
                    else:
                        logger.debug(
                            "Error: Inflection rule mode %x is not implemented" % mode
                        )
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            else:
                logger.debug(
                    "Error: Inflection rule mode %x is not implemented" % abyte
                )
                return None
        # array.tostring() was removed in Python 3.9; use tobytes() under Python 3.
        if PY3:
            return utf8_str(byteArray.tobytes())
        return utf8_str(byteArray.tostring())