#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function import os from loguru import logger __path__ = ["lib", os.path.dirname(os.path.realpath(__file__)), "kindleunpack"] import sys import codecs import traceback from .compatibility_utils import PY2, binary_type, utf8_str, unicode_str from .compatibility_utils import unicode_argv, add_cp65001_codec from .compatibility_utils import hexlify add_cp65001_codec() from .unipath import pathof if PY2: range = xrange # since will be printing unicode under python 2 need to protect # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding if sys.stdout.encoding is None: sys.stdout = codecs.getwriter("utf-8")(sys.stdout) else: encoding = sys.stdout.encoding sys.stdout = codecs.getwriter(encoding)(sys.stdout) # Changelog # 0.11 - Version by adamselene # 0.11pd - Tweaked version by pdurrant # 0.12 - extracts pictures too, and all into a folder. # 0.13 - added back in optional output dir for those who don't want it based on infile # 0.14 - auto flush stdout and wrapped in main, added proper return codes # 0.15 - added support for metadata # 0.16 - metadata now starting to be output as an opf file (PD) # 0.17 - Also created tweaked text as source for Mobipocket Creator # 0.18 - removed raw mobi file completely but kept _meta.html file for ease of conversion # 0.19 - added in metadata for ASIN, Updated Title and Rights to the opf # 0.20 - remove _meta.html since no longer needed # 0.21 - Fixed some typos in the opf output, and also updated handling # of test for trailing data/multibyte characters # 0.22 - Fixed problem with > 9 images # 0.23 - Now output Start guide item # 0.24 - Set firstaddl value for 'TEXtREAd' # 0.25 - Now added character set metadata to html file for utf-8 files. # 0.26 - Dictionary support added. Image handling speed improved. # For huge files create temp files to speed up decoding. # Language decoding fixed. Metadata is now converted to utf-8 when written to opf file. # 0.27 - Add idx:entry attribute "scriptable" if dictionary contains entry length tags. # Don't save non-image sections as images. Extract and save source zip file # included by kindlegen as kindlegensrc.zip. # 0.28 - Added back correct image file name extensions, created FastConcat class to simplify and clean up # 0.29 - Metadata handling reworked, multiple entries of the same type are now supported. # Several missing types added. # FastConcat class has been removed as in-memory handling with lists is faster, even for huge files. # 0.30 - Add support for outputting **all** metadata values - encode content with hex if of unknown type # 0.31 - Now supports Print Replica ebooks, outputting PDF and mysterious data sections # 0.32 - Now supports NCX file extraction/building. # Overhauled the structure of mobiunpack to be more class oriented. # 0.33 - Split Classes ito separate files and added prelim support for KF8 format eBooks # 0.34 - Improved KF8 support, guide support, bug fixes # 0.35 - Added splitting combo mobi7/mobi8 into standalone mobi7 and mobi8 files # Also handle mobi8-only file properly # 0.36 - very minor changes to support KF8 mobis with no flow items, no ncx, etc # 0.37 - separate output, add command line switches to control, interface to Mobi_Unpack.pyw # 0.38 - improve split function by resetting flags properly, fix bug in Thumbnail Images # 0.39 - improve split function so that ToC info is not lost for standalone mobi8s # 0.40 - make mobi7 split match official versions, add support for graphic novel metadata, # improve debug for KF8 # 0.41 - fix when StartOffset set to 0xffffffff, fix to work with older mobi versions, # fix other minor metadata issues # 0.42 - add new class interface to allow it to integrate more easily with internal calibre routines # 0.43 - bug fixes for new class interface # 0.44 - more bug fixes and fix for potnetial bug caused by not properly closing created zip archive # 0.45 - sync to version in the new Mobi_Unpack plugin # 0.46 - fixes for: obfuscated fonts, improper toc links and ncx, add support for opentype fonts # 0.47 - minor opf improvements # 0.48 - ncx link fixes # 0.49 - use azw3 when splitting mobis # 0.50 - unknown change # 0.51 - fix for converting filepos links to hrefs, Added GPL3 notice, made KF8 extension just '.azw3' # 0.52 - fix for cover metadata (no support for Mobipocket Creator) # 0.53 - fix for proper identification of embedded fonts, added new metadata items # 0.54 - Added error-handling so wonky embedded fonts don't bomb the whole unpack process, # entity escape KF8 metadata to ensure valid OPF. # 0.55 Strip extra StartOffset EXTH from the mobi8 header when splitting, keeping only the relevant one # For mobi8 files, don't generate duplicate guide entries from the metadata if we could extract one # from the OTH table. # 0.56 - Added further entity escaping of OPF text. # Allow unicode string file paths to be passed as arguments to the unpackBook method without blowing up later # when the attempt to "re"-unicode a portion of that filename occurs in the process_all_mobi_headers method. # 0.57 - Fixed eror when splitting Preview files downloaded from KDP website # 0.58 - Output original kindlegen build log ('CMET' record) if included in the package. # 0.58 - Include and extend functionality of DumpMobiHeader, replacing DEBUG with DUMP # 0.59 - Much added DUMP functionality, including full dumping and descriptions of sections # 0.60 - Bug fixes in opf, div tables, bad links, page breaks, section descriptions # - plus a number of other bug fixed that were found by Sergey Dubinets # - fixs for file/paths that require full unicode to work properly # - replace subprocess with multiprocessing to remove need for unbuffered stdout # 0.61 - renamed to be KindleUnpack and more unicode/utf-8 path bug fixes and other minor fixes # 0.62 - fix for multiprocessing on Windows, split fixes, opf improvements # 0.63 - Modified to process right to left page progression books properly. # - Added some id_map_strings and RESC section processing; metadata and # - spine in the RESC are integrated partly to content.opf. # 0.63a- Separated K8 RESC processor to an individual file. Bug fixes. Added cover page creation. # 0.64 - minor bug fixes to more properly handle unicode command lines, and support for more jpeg types # 0.64a- Modifed to handle something irregular mobi and azw3 files. # 0.64b- Modifed to create k8resc.spine for no RECS files. # 0.65 - Bug fixes to shorten title and remove epub3 "properties" to make the output epub2 compliant # 0.65a- Bug fixes to extract RESC section correctly, to prevent item id confliction # - and to process multiline comments in RESC. # 0.66 - Bug fix to deal with missing first resource information sometimes generated by calibre # 0.66a- Fixed minor bugs, which probably do not affect the output anything # 0.67 - Fixed Mobi Split functionality bug with azw3 images not being properly copied # 0.68 - preliminary support for handling PAGE sections to create page-map.xml # 0.69 - preliminary support for CONT and CRES for HD Images # 0.70 - preliminary support for decoding apnx files when used with azw3 ebooks # 0.71 - extensive refactoring of kindleunpack.py to make it more manageable # 0.72 - many bug fixes from tkeo: fix pageProcessing, fix print replica, fix resc usage, fix font mangling, etc. # 0.72a- fix for still broken PrintReplica support # 0.72b- preview for primary epub3 support. A parameter epubver(default='2') is added to process_all_mobi_headers(), unpackBook(). # 0.72c- preview for apnx page support # 0.72d- more bugs fixed in preview features, much improved GUI with ability to dynaically grow the Log Window with preference support # 0.72e- more bug fixes, Tk GUI adds support for epub version and HDImage use # 0.72f- more bug fixes, implement use hd images if present # 0.72g- minor bug fixes and cleanups from tkeo # 0.72h- updated mobi_header and mobi_k8proc to use the correct fragment and guide terms in place of div and other # to better match the terms that both Calibre and Amazon use internally to their own software # 0.72x- very experimental conversion to use new mobi_k8resc.py and some of its associated changes # 0.72y- more changes to simplify and integrate in epub3 support in a simpler manner # 0.72z- remove redundancy in mobi_opf.py and bug fixes for mobi_k8resc.py # 0.73 faster mobi split, numerous bug fixes in mobi_k8proc, mobi_header, mobi_opf, mobi_k8resc, etc # 0.74 added refines metadata, fixed language code in ncx and title in nav, added support for opf: from refines # 0.75 much improved dictioanry support including support for multiple inflection sections, minor mobi_opf fixes # 0.76 pre-release version only fix name related issues in opf by not using original file name in mobi7 # 0.77 bug fix for unpacking HDImages with included Fonts # 0.80 converted to work with both python 2.7 and Python 3.3 and later # 0.81 various fixes # 0.82 Handle calibre-generated mobis that can have skeletons with no fragments DUMP = True """ Set to True to dump all possible information. """ WRITE_RAW_DATA = False """ Set to True to create additional files with raw data for debugging/reverse engineering. """ SPLIT_COMBO_MOBIS = False """ Set to True to split combination mobis into mobi7 and mobi8 pieces. """ CREATE_COVER_PAGE = True # XXX experimental """ Create and insert a cover xhtml page. """ EOF_RECORD = b"\xe9\x8e" + b"\r\n" """ The EOF record content. """ TERMINATION_INDICATOR1 = b"\x00" TERMINATION_INDICATOR2 = b"\x00\x00" TERMINATION_INDICATOR3 = b"\x00\x00\x00" KINDLEGENSRC_FILENAME = "kindlegensrc.zip" """ The name for the kindlegen source archive. """ KINDLEGENLOG_FILENAME = "kindlegenbuild.log" """ The name for the kindlegen build log. """ K8_BOUNDARY = b"BOUNDARY" """ The section data that divides K8 mobi ebooks. """ import os import struct import re import zlib import getopt class unpackException(Exception): pass # import the kindleunpack support libraries from .unpack_structure import fileNames from .mobi_sectioner import Sectionizer, describe from .mobi_header import MobiHeader, dump_contexth from .mobi_utils import toBase32 from .mobi_opf import OPFProcessor from .mobi_html import HTMLProcessor, XHTMLK8Processor from .mobi_ncx import ncxExtract from .mobi_k8proc import K8Processor from .mobi_split import mobi_split from .mobi_k8resc import K8RESCProcessor from .mobi_nav import NAVProcessor from .mobi_cover import CoverProcessor, get_image_type from .mobi_pagemap import PageMapProcessor from .mobi_dict import dictSupport def processSRCS(i, files, rscnames, sect, data): # extract the source zip archive and save it. logger.debug( "File contains kindlegen source archive, extracting as %s" % KINDLEGENSRC_FILENAME ) srcname = os.path.join(files.outdir, KINDLEGENSRC_FILENAME) with open(pathof(srcname), "wb") as f: f.write(data[16:]) rscnames.append(None) sect.setsectiondescription(i, "Zipped Source Files") return rscnames def processPAGE(i, files, rscnames, sect, data, mh, pagemapproc): # process any page map information and create an apnx file pagemapproc = PageMapProcessor(mh, data) rscnames.append(None) sect.setsectiondescription(i, "PageMap") apnx_meta = {} acr = sect.palmname.decode("latin-1").rstrip("\x00") apnx_meta["acr"] = acr apnx_meta["cdeType"] = mh.metadata["cdeType"][0] apnx_meta["contentGuid"] = hex(int(mh.metadata["UniqueID"][0]))[2:] apnx_meta["asin"] = mh.metadata["ASIN"][0] apnx_meta["pageMap"] = pagemapproc.getPageMap() if mh.version == 8: apnx_meta["format"] = "MOBI_8" else: apnx_meta["format"] = "MOBI_7" apnx_data = pagemapproc.generateAPNX(apnx_meta) if mh.isK8(): outname = os.path.join( files.outdir, "mobi8-" + files.getInputFileBasename() + ".apnx" ) else: outname = os.path.join( files.outdir, "mobi7-" + files.getInputFileBasename() + ".apnx" ) with open(pathof(outname), "wb") as f: f.write(apnx_data) return rscnames, pagemapproc def processCMET(i, files, rscnames, sect, data): # extract the build log logger.debug( "File contains kindlegen build log, extracting as %s" % KINDLEGENLOG_FILENAME ) srcname = os.path.join(files.outdir, KINDLEGENLOG_FILENAME) with open(pathof(srcname), "wb") as f: f.write(data[10:]) rscnames.append(None) sect.setsectiondescription(i, "Kindlegen log") return rscnames # fonts only exist in KF8 ebooks # Format: bytes 0 - 3: 'FONT' # bytes 4 - 7: uncompressed size # bytes 8 - 11: flags # flag bit 0x0001 - zlib compression # flag bit 0x0002 - obfuscated with xor string # bytes 12 - 15: offset to start of compressed font data # bytes 16 - 19: length of xor string stored before the start of the comnpress font data # bytes 20 - 23: start of xor string def processFONT(i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr): fontname = "font%05d" % i ext = ".dat" font_error = False font_data = data try: usize, fflags, dstart, xor_len, xor_start = struct.unpack_from( b">LLLLL", data, 4 ) except: logger.debug( "Failed to extract font: {0:s} from section {1:d}".format(fontname, i) ) font_error = True ext = ".failed" pass if not font_error: logger.debug("Extracting font:", fontname) font_data = data[dstart:] extent = len(font_data) extent = min(extent, 1040) if fflags & 0x0002: # obfuscated so need to de-obfuscate the first 1040 bytes key = bytearray(data[xor_start : xor_start + xor_len]) buf = bytearray(font_data) for n in range(extent): buf[n] ^= key[n % xor_len] font_data = bytes(buf) if fflags & 0x0001: # ZLIB compressed data font_data = zlib.decompress(font_data) hdr = font_data[0:4] if hdr == b"\0\1\0\0" or hdr == b"true" or hdr == b"ttcf": ext = ".ttf" elif hdr == b"OTTO": ext = ".otf" else: logger.debug("Warning: unknown font header %s" % hexlify(hdr)) if (ext == ".ttf" or ext == ".otf") and (fflags & 0x0002): obfuscate_data.append(fontname + ext) fontname += ext outfnt = os.path.join(files.imgdir, fontname) with open(pathof(outfnt), "wb") as f: f.write(font_data) rscnames.append(fontname) sect.setsectiondescription(i, "Font {0:s}".format(fontname)) if rsc_ptr == -1: rsc_ptr = i - beg return rscnames, obfuscate_data, rsc_ptr def processCRES(i, files, rscnames, sect, data, beg, rsc_ptr, use_hd): # extract an HDImage global DUMP data = data[12:] imgtype = get_image_type(None, data) if imgtype is None: logger.debug( "Warning: CRES Section %s does not contain a recognised resource" % i ) rscnames.append(None) sect.setsectiondescription( i, "Mysterious CRES data, first four bytes %s" % describe(data[0:4]) ) if DUMP: fname = "unknown%05d.dat" % i outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) sect.setsectiondescription( i, "Mysterious CRES data, first four bytes %s extracting as %s" % (describe(data[0:4]), fname), ) rsc_ptr += 1 return rscnames, rsc_ptr if use_hd: # overwrite corresponding lower res image with hd version imgname = rscnames[rsc_ptr] imgdest = files.imgdir else: imgname = "HDimage%05d.%s" % (i, imgtype) imgdest = files.hdimgdir logger.debug("Extracting HD image: {0:s} from section {1:d}".format(imgname, i)) outimg = os.path.join(imgdest, imgname) with open(pathof(outimg), "wb") as f: f.write(data) rscnames.append(None) sect.setsectiondescription(i, "Optional HD Image {0:s}".format(imgname)) rsc_ptr += 1 return rscnames, rsc_ptr def processCONT(i, files, rscnames, sect, data): global DUMP # process a container header, most of this is unknown # right now only extract its EXTH dt = data[0:12] if dt == b"CONTBOUNDARY": rscnames.append(None) sect.setsectiondescription(i, "CONTAINER BOUNDARY") else: sect.setsectiondescription(i, "CONT Header") rscnames.append(None) if DUMP: (cpage,) = struct.unpack_from(b">L", data, 12) contexth = data[48:] logger.debug("\n\nContainer EXTH Dump") dump_contexth(cpage, contexth) fname = "CONT_Header%05d.dat" % i outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) return rscnames def processkind(i, files, rscnames, sect, data): global DUMP dt = data[0:12] if dt == b"kindle:embed": if DUMP: logger.debug("\n\nHD Image Container Description String") logger.debug(data) sect.setsectiondescription(i, "HD Image Container Description String") rscnames.append(None) return rscnames # spine information from the original content.opf def processRESC(i, files, rscnames, sect, data, k8resc): global DUMP if DUMP: rescname = "RESC%05d.dat" % i logger.debug("Extracting Resource: ", rescname) outrsc = os.path.join(files.outdir, rescname) with open(pathof(outrsc), "wb") as f: f.write(data) if True: # try: # parse the spine and metadata from RESC k8resc = K8RESCProcessor(data[16:], DUMP) else: # except: logger.debug("Warning: cannot extract information from RESC.") k8resc = None rscnames.append(None) sect.setsectiondescription(i, "K8 RESC section") return rscnames, k8resc # CG image write to data and file def processImage(i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset): global DUMP # Extract an Image imgtype = get_image_type(None, data) if imgtype is None: logger.debug("Warning: Section %s does not contain a recognised resource" % i) rscnames.append(None) sect.setsectiondescription( i, "Mysterious Section, first four bytes %s" % describe(data[0:4]) ) if DUMP: fname = "unknown%05d.dat" % i outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) sect.setsectiondescription( i, "Mysterious Section, first four bytes %s extracting as %s" % (describe(data[0:4]), fname), ) return rscnames, rsc_ptr imgname = "image%05d.%s" % (i, imgtype) if cover_offset is not None and i == beg + cover_offset: imgname = "cover%05d.%s" % (i, imgtype) logger.debug("Extracting image: {0:s} from section {1:d}".format(imgname, i)) outimg = os.path.join(files.imgdir, imgname) with open(pathof(outimg), "wb") as f: f.write(data) rscnames.append(imgname) sect.setsectiondescription(i, "Image {0:s}".format(imgname)) if rsc_ptr == -1: rsc_ptr = i - beg return rscnames, rsc_ptr def processPrintReplica(metadata, files, rscnames, mh): global DUMP global WRITE_RAW_DATA rawML = mh.getRawML() if DUMP or WRITE_RAW_DATA: outraw = os.path.join(files.outdir, files.getInputFileBasename() + ".rawpr") with open(pathof(outraw), "wb") as f: f.write(rawML) fileinfo = [] logger.debug("Print Replica ebook detected") try: (numTables,) = struct.unpack_from(b">L", rawML, 0x04) tableIndexOffset = 8 + 4 * numTables # for each table, read in count of sections, assume first section is a PDF # and output other sections as binary files for i in range(numTables): (sectionCount,) = struct.unpack_from(b">L", rawML, 0x08 + 4 * i) for j in range(sectionCount): sectionOffset, sectionLength, = struct.unpack_from( b">LL", rawML, tableIndexOffset ) tableIndexOffset += 8 if j == 0: entryName = os.path.join( files.outdir, files.getInputFileBasename() + (".%03d.pdf" % (i + 1)), ) else: entryName = os.path.join( files.outdir, files.getInputFileBasename() + (".%03d.%03d.data" % ((i + 1), j)), ) with open(pathof(entryName), "wb") as f: f.write(rawML[sectionOffset : (sectionOffset + sectionLength)]) except Exception as e: logger.debug("Error processing Print Replica: " + str(e)) fileinfo.append([None, "", files.getInputFileBasename() + ".pdf"]) usedmap = {} for name in rscnames: if name is not None: usedmap[name] = "used" opf = OPFProcessor(files, metadata, fileinfo, rscnames, False, mh, usedmap) opf.writeOPF() def processMobi8( mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile=None, epubver="2", ): global DUMP global WRITE_RAW_DATA # extract raw markup langauge rawML = mh.getRawML() if DUMP or WRITE_RAW_DATA: outraw = os.path.join(files.k8dir, files.getInputFileBasename() + ".rawml") with open(pathof(outraw), "wb") as f: f.write(rawML) # KF8 require other indexes which contain parsing information and the FDST info # to process the rawml back into the xhtml files, css files, svg image files, etc k8proc = K8Processor(mh, sect, files, DUMP) k8proc.buildParts(rawML) # collect information for the guide first guidetext = unicode_str(k8proc.getGuideText()) # if the guide was empty, add in any guide info from metadata, such as StartOffset if not guidetext and "StartOffset" in metadata: # Apparently, KG 2.5 carries over the StartOffset from the mobi7 part... # Taking that into account, we only care about the *last* StartOffset, which # should always be the correct one in these cases (the one actually pointing # to the right place in the mobi8 part). starts = metadata["StartOffset"] last_start = starts[-1] last_start = int(last_start) if last_start == 0xFFFFFFFF: last_start = 0 seq, idtext = k8proc.getFragTblInfo(last_start) filename, idtext = k8proc.getIDTagByPosFid(toBase32(seq), b"0000000000") linktgt = filename idtext = unicode_str(idtext, mh.codec) if idtext != "": linktgt += "#" + idtext guidetext += '\n' % linktgt # if apnxfile is passed in use it for page map information if apnxfile is not None and pagemapproc is None: with open(apnxfile, "rb") as f: apnxdata = b"00000000" + f.read() pagemapproc = PageMapProcessor(mh, apnxdata) # generate the page map pagemapxml = "" if pagemapproc is not None: pagemapxml = pagemapproc.generateKF8PageMapXML(k8proc) outpm = os.path.join(files.k8oebps, "page-map.xml") with open(pathof(outpm), "wb") as f: f.write(pagemapxml.encode("utf-8")) if DUMP: logger.debug(pagemapproc.getNames()) logger.debug(pagemapproc.getOffsets()) logger.debug("\n\nPage Map") logger.debug(pagemapxml) # process the toc ncx # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num logger.debug("Processing ncx / toc ") ncx = ncxExtract(mh, files) ncx_data = ncx.parseNCX() print('ncx_data K8 {}'.format(ncx_data)) # extend the ncx data with filenames and proper internal idtags for i in range(len(ncx_data)): ncxmap = ncx_data[i] [junk1, junk2, junk3, fid, junk4, off] = ncxmap["pos_fid"].split(":") filename, idtag = k8proc.getIDTagByPosFid(fid, off) ncxmap["filename"] = filename ncxmap["idtag"] = unicode_str(idtag) ncx_data[i] = ncxmap # convert the rawML to a set of xhtml files logger.debug("Building an epub-like structure") htmlproc = XHTMLK8Processor(rscnames, k8proc) usedmap = htmlproc.buildXHTML() # write out the xhtml svg, and css files # fileinfo = [skelid|coverpage, dir, name] fileinfo = [] # first create a cover page if none exists if CREATE_COVER_PAGE: cover = CoverProcessor(files, metadata, rscnames) cover_img = utf8_str(cover.getImageName()) need_to_create_cover_page = False if cover_img is not None: if k8resc is None or not k8resc.hasSpine(): part = k8proc.getPart(0) if part.find(cover_img) == -1: need_to_create_cover_page = True else: if "coverpage" not in k8resc.spine_idrefs: part = k8proc.getPart(int(k8resc.spine_order[0])) if part.find(cover_img) == -1: k8resc.prepend_to_spine("coverpage", "inserted", "no", None) if k8resc.spine_order[0] == "coverpage": need_to_create_cover_page = True if need_to_create_cover_page: filename = cover.getXHTMLName() fileinfo.append(["coverpage", "Text", filename]) guidetext += cover.guide_toxml() cover.writeXHTML() n = k8proc.getNumberOfParts() for i in range(n): part = k8proc.getPart(i) [skelnum, dir, filename, beg, end, aidtext] = k8proc.getPartInfo(i) fileinfo.append([str(skelnum), dir, filename]) fname = os.path.join(files.k8oebps, dir, filename) with open(pathof(fname), "wb") as f: f.write(part) n = k8proc.getNumberOfFlows() for i in range(1, n): [ptype, pformat, pdir, filename] = k8proc.getFlowInfo(i) flowpart = k8proc.getFlow(i) if pformat == b"file": fileinfo.append([None, pdir, filename]) fname = os.path.join(files.k8oebps, pdir, filename) with open(pathof(fname), "wb") as f: f.write(flowpart) # create the opf opf = OPFProcessor( files, metadata.copy(), fileinfo, rscnames, True, mh, usedmap, pagemapxml=pagemapxml, guidetext=guidetext, k8resc=k8resc, epubver=epubver, ) uuid = opf.writeOPF(bool(obfuscate_data)) if opf.hasNCX(): # CGDBG # Create a toc.ncx. # ncx.writeK8NCX(ncx_data, metadata) pass if opf.hasNAV(): # Create a navigation document. nav = NAVProcessor(files) nav.writeNAV(ncx_data, guidetext, metadata) # make an epub-like structure of it all logger.debug("Creating an epub-like file") files.makeEPUB(usedmap, obfuscate_data, uuid) def processMobi7(mh, metadata, sect, files, rscnames): global DUMP global WRITE_RAW_DATA # An original Mobi rawML = mh.getRawML() if DUMP or WRITE_RAW_DATA: outraw = os.path.join(files.mobi7dir, files.getInputFileBasename() + ".rawml") with open(pathof(outraw), "wb") as f: f.write(rawML) # process the toc ncx # ncx map keys: name, pos, len, noffs, text, hlvl, kind, pos_fid, parent, child1, childn, num ncx = ncxExtract(mh, files) ncx_data = ncx.parseNCX() print('ncx_data K7 {}'.format(ncx_data)) # CGDBG write ncx information to toc.ncx # ncx.writeNCX(metadata) positionMap = {} # if Dictionary build up the positionMap if mh.isDictionary(): if mh.DictInLanguage(): metadata["DictInLanguage"] = [mh.DictInLanguage()] if mh.DictOutLanguage(): metadata["DictOutLanguage"] = [mh.DictOutLanguage()] positionMap = dictSupport(mh, sect).getPositionMap() # convert the rawml back to Mobi ml proc = HTMLProcessor(files, metadata, rscnames) srctext = proc.findAnchors(rawML, ncx_data, positionMap) srctext, usedmap = proc.insertHREFS() # write the proper mobi html fileinfo = [] # fname = files.getInputFileBasename() + '.html' fname = "book.html" fileinfo.append([None, "", fname]) outhtml = os.path.join(files.mobi7dir, fname) with open(pathof(outhtml), "wb") as f: f.write(srctext) # extract guidetext from srctext guidetext = b"" # no pagemap support for older mobis # pagemapxml = None guidematch = re.search( br"""(.*)""", srctext, re.IGNORECASE + re.DOTALL ) if guidematch: guidetext = guidematch.group(1) # sometimes old mobi guide from srctext horribly written so need to clean up guidetext = guidetext.replace(b"\r", b"") guidetext = guidetext.replace(b"]*>)""", re.IGNORECASE) guidepieces = ref_tag_pattern.split(guidetext) for i in range(1, len(guidepieces), 2): reftag = guidepieces[i] # remove any href there now to replace with filepos reftag = re.sub(br"""href\s*=[^'"]*['"][^'"]*['"]""", b"", reftag) # make sure the reference tag ends properly if not reftag.endswith(b"/>"): reftag = reftag[0:-1] + b"/>" guidepieces[i] = reftag guidetext = b"".join(guidepieces) replacetext = br'''href="''' + utf8_str(fileinfo[0][2]) + br'''#filepos\1"''' guidetext = re.sub( br"""filepos=['"]{0,1}0*(\d+)['"]{0,1}""", replacetext, guidetext ) guidetext += b"\n" if "StartOffset" in metadata: for value in metadata["StartOffset"]: if int(value) == 0xFFFFFFFF: value = "0" starting_offset = value # get guide items from metadata metaguidetext = ( b'\n' ) guidetext += metaguidetext if isinstance(guidetext, binary_type): guidetext = guidetext.decode(mh.codec) # create an OPF opf = OPFProcessor( files, metadata, fileinfo, rscnames, ncx.isNCX, mh, usedmap, guidetext=guidetext ) opf.writeOPF() def processUnknownSections(mh, sect, files, K8Boundary): global DUMP global TERMINATION_INDICATOR1 global TERMINATION_INDICATOR2 global TERMINATION_INDICATOR3 if DUMP: logger.debug("Unpacking any remaining unknown records") beg = mh.start end = sect.num_sections if beg < K8Boundary: # then we're processing the first part of a combination file end = K8Boundary for i in range(beg, end): if sect.sectiondescriptions[i] == "": data = sect.loadSection(i) type = data[0:4] if type == TERMINATION_INDICATOR3: description = "Termination Marker 3 Nulls" elif type == TERMINATION_INDICATOR2: description = "Termination Marker 2 Nulls" elif type == TERMINATION_INDICATOR1: description = "Termination Marker 1 Null" elif type == "INDX": fname = "Unknown%05d_INDX.dat" % i description = "Unknown INDX section" if DUMP: outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) logger.debug( "Extracting %s: %s from section %d" % (description, fname, i) ) description = description + ", extracting as %s" % fname else: fname = "unknown%05d.dat" % i description = "Mysterious Section, first four bytes %s" % describe( data[0:4] ) if DUMP: outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) logger.debug( "Extracting %s: %s from section %d" % (description, fname, i) ) description = description + ", extracting as %s" % fname sect.setsectiondescription(i, description) def process_all_mobi_headers( files, apnxfile, sect, mhlst, K8Boundary, k8only=False, epubver="2", use_hd=False ): global DUMP global WRITE_RAW_DATA rscnames = [] rsc_ptr = -1 k8resc = None obfuscate_data = [] # CGDBG print('mhlst {}'.format(mhlst)) for mh in mhlst: print('mh {}'.format(mh)) pagemapproc = None if mh.isK8(): sect.setsectiondescription(mh.start, "KF8 Header") mhname = os.path.join(files.outdir, "header_K8.dat") logger.debug("Processing K8 section of book...") elif mh.isPrintReplica(): sect.setsectiondescription(mh.start, "Print Replica Header") mhname = os.path.join(files.outdir, "header_PR.dat") logger.debug("Processing PrintReplica section of book...") else: if mh.version == 0: sect.setsectiondescription( mh.start, "PalmDoc Header".format(mh.version) ) else: sect.setsectiondescription( mh.start, "Mobipocket {0:d} Header".format(mh.version) ) mhname = os.path.join(files.outdir, "header.dat") logger.debug( "Processing Mobipocket {0:d} section of book...".format(mh.version) ) if DUMP: # write out raw mobi header data with open(pathof(mhname), "wb") as f: f.write(mh.header) # process each mobi header metadata = mh.getMetaData() mh.describeHeader(DUMP) if mh.isEncrypted(): raise unpackException("Book is encrypted") pagemapproc = None # first handle all of the different resource sections: images, resources, fonts, and etc # build up a list of image names to use to postprocess the ebook logger.debug("Unpacking images, resources, fonts, etc") beg = mh.firstresource end = sect.num_sections if beg < K8Boundary: # processing first part of a combination file end = K8Boundary cover_offset = int(metadata.get("CoverOffset", ["-1"])[0]) if not CREATE_COVER_PAGE: cover_offset = None # 写入局部变量data作用? for i in range(beg, end): # data只作临时变量处理数据 data = sect.loadSection(i) type = data[0:4] # handle the basics first if type in [b"FLIS", b"FCIS", b"FDST", b"DATP"]: if DUMP: fname = unicode_str(type) + "%05d" % i if mh.isK8(): fname += "_K8" fname += ".dat" outname = os.path.join(files.outdir, fname) with open(pathof(outname), "wb") as f: f.write(data) logger.debug( "Dumping section {0:d} type {1:s} to file {2:s} ".format( i, unicode_str(type), outname ) ) sect.setsectiondescription(i, "Type {0:s}".format(unicode_str(type))) rscnames.append(None) elif type == b"SRCS": rscnames = processSRCS(i, files, rscnames, sect, data) elif type == b"PAGE": rscnames, pagemapproc = processPAGE( i, files, rscnames, sect, data, mh, pagemapproc ) elif type == b"CMET": rscnames = processCMET(i, files, rscnames, sect, data) elif type == b"FONT": rscnames, obfuscate_data, rsc_ptr = processFONT( i, files, rscnames, sect, data, obfuscate_data, beg, rsc_ptr ) elif type == b"CRES": rscnames, rsc_ptr = processCRES( i, files, rscnames, sect, data, beg, rsc_ptr, use_hd ) elif type == b"CONT": rscnames = processCONT(i, files, rscnames, sect, data) elif type == b"kind": rscnames = processkind(i, files, rscnames, sect, data) elif type == b"\xa0\xa0\xa0\xa0": sect.setsectiondescription(i, "Empty_HD_Image/Resource_Placeholder") rscnames.append(None) rsc_ptr += 1 elif type == b"RESC": rscnames, k8resc = processRESC(i, files, rscnames, sect, data, k8resc) elif data == EOF_RECORD: sect.setsectiondescription(i, "End Of File") rscnames.append(None) elif data[0:8] == b"BOUNDARY": sect.setsectiondescription(i, "BOUNDARY Marker") rscnames.append(None) else: # if reached here should be an image ow treat as unknown rscnames, rsc_ptr = processImage( i, files, rscnames, sect, data, beg, rsc_ptr, cover_offset ) # CGDBG print('rscnames {}, rsc_ptr {} data {}'.format( rscnames, rsc_ptr, data )) # done unpacking resources # Print Replica if mh.isPrintReplica() and not k8only: processPrintReplica(metadata, files, rscnames, mh) continue # KF8 (Mobi 8) if mh.isK8(): processMobi8( mh, metadata, sect, files, rscnames, pagemapproc, k8resc, obfuscate_data, apnxfile, epubver, ) # Old Mobi (Mobi 7) elif not k8only: processMobi7(mh, metadata, sect, files, rscnames) # CGDBG print('k8only {} mh.isK8() {}'.format(k8only, mh.isK8())) # process any remaining unknown sections of the palm file processUnknownSections(mh, sect, files, K8Boundary) return def unpackBook( infile, outdir, apnxfile=None, epubver="2", use_hd=False, dodump=False, dowriteraw=False, dosplitcombos=False, ): global DUMP global WRITE_RAW_DATA global SPLIT_COMBO_MOBIS if DUMP or dodump: DUMP = True if WRITE_RAW_DATA or dowriteraw: WRITE_RAW_DATA = True if SPLIT_COMBO_MOBIS or dosplitcombos: SPLIT_COMBO_MOBIS = True infile = unicode_str(infile) outdir = unicode_str(outdir) if apnxfile is not None: apnxfile = unicode_str(apnxfile) files = fileNames(infile, outdir) # process the PalmDoc database header and verify it is a mobi sect = Sectionizer(infile) if sect.ident != b"BOOKMOBI" and sect.ident != b"TEXtREAd": raise unpackException("Invalid file format") if DUMP: logger.debug( "dumppalmheader ...") sect.dumppalmheader() else: logger.debug( "Palm DB type: %s, %d sections." % (sect.ident.decode("utf-8"), sect.num_sections) ) # CGDBG print('infile {} outdir {}'.format(infile, outdir)) print('sect.dumpsectionsinfo() {}'.format(sect.dumpsectionsinfo())) print('sect.dumppalmheader() {}'.format(sect.dumppalmheader())) # scan sections to see if this is a compound mobi file (K8 format) # and build a list of all mobi headers to process. mhlst = [] # CG mobi header mh = MobiHeader(sect, 0) # if this is a mobi8-only file hasK8 here will be true mhlst.append(mh) K8Boundary = -1 if mh.isK8(): logger.debug("Unpacking a KF8 book...") hasK8 = True else: # CGDBG # This is either a Mobipocket 7 or earlier, or a combi M7/KF8 # Find out which hasK8 = False for i in range(len(sect.sectionoffsets) - 1): before, after = sect.sectionoffsets[i : i + 2] if (after - before) == 8: data = sect.loadSection(i) if data == K8_BOUNDARY: sect.setsectiondescription(i, "Mobi/KF8 Boundary Section") mh = MobiHeader(sect, i + 1) hasK8 = True # K8 mhlst.append(mh) K8Boundary = i break if hasK8: logger.debug( "Unpacking a Combination M{0:d}/KF8 book...".format(mh.version) ) if SPLIT_COMBO_MOBIS: # if this is a combination mobi7-mobi8 file split them up mobisplit = mobi_split(infile) if mobisplit.combo: outmobi7 = os.path.join( files.outdir, "mobi7-" + files.getInputFileBasename() + ".mobi" ) outmobi8 = os.path.join( files.outdir, "mobi8-" + files.getInputFileBasename() + ".azw3" ) with open(pathof(outmobi7), "wb") as f: f.write(mobisplit.getResult7()) with open(pathof(outmobi8), "wb") as f: f.write(mobisplit.getResult8()) else: logger.debug("Unpacking a Mobipocket {0:d} book...".format(mh.version)) if hasK8: files.makeK8Struct() # CGDBG mhlst - mobi_header list # process all mobi books process_all_mobi_headers( files, apnxfile, sect, mhlst, K8Boundary, False, epubver, use_hd ) if DUMP: sect.dumpsectionsinfo() return def usage(progname): print("") print("Description:") print(" Unpacks an unencrypted Kindle/MobiPocket ebook to html and images") print(" or an unencrypted Kindle/Print Replica ebook to PDF and images") print(" into the specified output folder.") print("Usage:") print(" %s -r -s -p apnxfile -d -h --epub_version= infile [outdir]" % progname) print("Options:") print(" -h print this help message") print( " -i use HD Images, if present, to overwrite reduced resolution images" ) print(" -s split combination mobis into mobi7 and mobi8 ebooks") print( " -p APNXFILE path to an .apnx file associated with the azw3 input (optional)" ) print( " --epub_version= specify epub version to unpack to: 2, 3, A (for automatic) or " ) print( " F (force to fit to epub2 definitions), default is 2" ) print( " -d dump headers and other info to output and extra files" ) print(" -r write raw data to the output folder") def main(argv=unicode_argv()): global DUMP global WRITE_RAW_DATA global SPLIT_COMBO_MOBIS print("KindleUnpack v0.82") print( " Based on initial mobipocket version Copyright © 2009 Charles M. Hannum " ) print(" Extensive Extensions and Improvements Copyright © 2009-2014 ") print( " by: P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo." ) print(" This program is free software: you can redistribute it and/or modify") print(" it under the terms of the GNU General Public License as published by") print(" the Free Software Foundation, version 3.") progname = os.path.basename(argv[0]) try: opts, args = getopt.getopt(argv[1:], "dhirsp:", ["epub_version="]) except getopt.GetoptError as err: print(str(err)) usage(progname) sys.exit(2) if len(args) < 1: usage(progname) sys.exit(2) apnxfile = None epubver = "2" use_hd = False for o, a in opts: if o == "-h": usage(progname) sys.exit(0) if o == "-i": use_hd = True if o == "-d": DUMP = True if o == "-r": WRITE_RAW_DATA = True if o == "-s": SPLIT_COMBO_MOBIS = True if o == "-p": apnxfile = a if o == "--epub_version": epubver = a if len(args) > 1: infile, outdir = args else: infile = args[0] outdir = os.path.splitext(infile)[0] infileext = os.path.splitext(infile)[1].upper() if infileext not in [".MOBI", ".PRC", ".AZW", ".AZW3", ".AZW4"]: print( "Error: first parameter must be a Kindle/Mobipocket ebook or a Kindle/Print Replica ebook." ) return 1 try: print("Unpacking Book...") unpackBook(infile, outdir, apnxfile, epubver, use_hd) print("Completed") except ValueError as e: print("Error: %s" % e) print(traceback.format_exc()) return 1 return 0 if __name__ == "__main__": sys.exit(main())