如何解析 ToC 条目的目标页面
How to resolve the target page of ToC entries
pdfminer.six 允许通过方法 PDFDocument.get_outlines()
访问文档的目录(在 PDF 的内部结构中称为 “Outlines”)。
一个最小的示例如下:
from pathlib import Path
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
file_name = Path("...")
with open(file_name, "rb") as fp:
    # Build the parser before entering ``try`` so that the ``finally`` clause
    # can always close it; binding it inside ``try`` would turn any failure
    # during construction into a NameError raised from ``finally``.
    parser = PDFParser(fp)
    try:
        document = PDFDocument(parser)
        # Each outline entry unpacks into (level, title, dest, a, se).
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            ...  # do something
    except PDFNoOutlines:
        print("未找到目录。")
    except PDFSyntaxError:
        print("PDF 文件损坏或不是 PDF 文件。")
    finally:
        parser.close()
但是,每个目录项的不同字段代表什么含义呢?要回答这个问题,我们可以参考 PDF 参考手册 的 12.3.3 Document Outline 章节:
- Level (`int`): 该条目所在的层级。例如,顶级条目的 level 值为 1,其子条目的 level 值为 2,依此类推。
- Title (`str`): 条目的名称,例如 “1. Introduction”。
- Dest (`Union[list, bytes]`, 可选): 该字段用于指示条目的目标对象(可以是页面或其他对象)。如果 Dest 字段存在,则 A 字段不会出现。具体的目标定义方式可参考 PDF 参考手册 的 12.3.2 Destinations 章节。
- A (`pdfminer.pdftypes.PDFObjRef`, 可选): 除了 Dest,还可以使用 A 字段来定义目标,该字段表示一个操作(Action)。详细信息请参考 12.6 Actions 章节。
- SE (`pdfminer.pdftypes.PDFObjRef`, 可选): 该字段指向条目对应的结构元素(Structure Element)。详细信息请参考 14.7.2 Structure Hierarchy 章节。需要注意的是,大多数 PDF 不会包含该字段,而是使用 Dest 或 A 代替;即使包含该字段,也可能同时保留 Dest,以兼容 1.3 之前的 PDF 版本。
遗憾的是,pdfminer.six 并不会直接提供目录项所指向的页码。不过,了解了上述字段的含义后,我们可以自己实现一个目录项的页码解析器:
from enum import Enum, auto
from pathlib import Path
from typing import Any, Optional
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef
class PDFRefType(Enum):
    """Kinds of PDF reference distinguished by the page number resolver."""

    PDF_OBJ_REF = 1  # a PDFObjRef instance
    DICTIONARY = 2  # a dict that has a "D" key
    LIST = 3  # a list holding at least one PDFObjRef
    NAMED_REF = 4  # a bytes name
    UNK = 5  # fallback for anything else
class RefPageNumberResolver:
    """PDF Reference to page number resolver.

    .. note::
        Remote Go-To Actions (see 12.6.4.3 in
        `https://www.adobe.com/go/pdfreference/`__)
        are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from the object id of a page to its page number
            (pages are numbered from 1).
    """

    def __init__(self, document: PDFDocument):
        self.document = document
        # page object id -> page number, counting from 1 in the order
        # PDFPage.create_pages() yields the pages
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Get the type of a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The category of ``ref``;
            :obj:`PDFRefType.UNK` when it matches none of the known shapes.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a resolved object is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The resolved PDF object.

        Returns:
            :obj:`bool`: :obj:`True` if the object is a dictionary whose
            ``Type`` entry is the page literal, :obj:`False` otherwise.
        """
        return isinstance(ref, dict) and ref.get("Type") is LITERAL_PAGE

    def resolve(self, ref: Any, _visited: Optional[set] = None) -> Optional[int]:
        """Resolve a PDF reference to a page number recursively.

        Args:
            ref (:obj:`Any`):
                The PDF reference.
            _visited (:obj:`Optional[set]`):
                Internal bookkeeping for the recursion (object ids and
                destination names already followed). Callers should not
                pass it.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions, malformed references or reference cycles).
        """
        if _visited is None:
            _visited = set()

        ref_type = self.get_ref_type(ref)

        if ref_type is PDFRefType.PDF_OBJ_REF:
            # A malformed document may contain a chain of references that
            # loops back on itself; stop instead of recursing forever.
            if ref.objid in _visited:
                return None
            _visited.add(ref.objid)
            # Resolve the indirect object once and reuse the result.
            target = ref.resolve()
            if self.is_ref_page(target):
                return self.objid_to_pagenum.get(ref.objid)
            return self.resolve(target, _visited)

        if ref_type is PDFRefType.DICTIONARY:
            return self.resolve(ref["D"], _visited)

        if ref_type is PDFRefType.LIST:
            # Get the first PDFObjRef in the list; get_ref_type() guarantees
            # that at least one exists.
            first_ref = next(e for e in ref if isinstance(e, PDFObjRef))
            return self.resolve(first_ref, _visited)

        if ref_type is PDFRefType.NAMED_REF:
            # A name that leads back to itself would also loop forever.
            if ref in _visited:
                return None
            _visited.add(ref)
            return self.resolve(self.document.get_dest(ref), _visited)

        return None  # PDFRefType.UNK
类 PDFRefType 只是一个辅助工具,用于对引用类型进行分类。
由于一个引用可能指向另一个引用,因此在某些情况下,我们需要递归调用 RefPageNumberResolver.resolve(),直到最终解析到 Page 对象。
然后,我们可以从 RefPageNumberResolver.objid_to_pagenum 这个字典(页面对象 ID 到页码的映射)中获取该对象对应的页码。
使用这个页码解析器,我们可以以可读格式打印 PDF 文档的目录:
def print_outlines(file: str) -> None:
    """Pretty print the outlines (ToC) of a PDF document.

    Each entry is printed on its own line, indented by four spaces per
    nesting level and followed by filling dots and the target page number.
    Entries whose target cannot be resolved to a page show ``?`` instead
    of a number.

    Args:
        file (:obj:`str`):
            Path of the PDF file to read.
    """
    with open(file, "rb") as fp:
        # Create the parser before ``try`` so ``finally`` can always close it.
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser)
            ref_pagenum_resolver = RefPageNumberResolver(document)
            outlines = list(document.get_outlines())
            if not outlines:
                print("未找到目录。")
            for (level, title, dest, a, se) in outlines:
                # Use the first target that is present: destination, then
                # action, then structure element.
                target = dest or a or se
                page_num = ref_pagenum_resolver.resolve(target) if target else None
                # None does not support the ">3" format spec (TypeError),
                # so turn the page number into a printable label first.
                page_label = "?" if page_num is None else str(page_num)
                # Calculate leading spaces and filling dots for formatting;
                # a negative dot count simply yields no dots.
                leading_spaces = (level - 1) * 4
                fill_dots = 80 - len(title) - leading_spaces
                print(
                    f"{' ' * leading_spaces}{title}",
                    "." * fill_dots,
                    f"{page_label:>3}",
                )
        except PDFNoOutlines:
            print("未找到目录。")
        except PDFSyntaxError:
            print("PDF 文件损坏或不是 PDF 文件。")
        finally:
            parser.close()
# Script entry point: print the outlines of the PDF at ``file_name``.
if __name__ == "__main__":
    file_name = Path("...")  # presumably a placeholder; replace with a real PDF path
    print_outlines(file_name)
pdfminer.six lets you access the Table of Contents (or “Outlines”, as it is called in
the PDF internal structure) of a document through the method
PDFDocument.get_outlines().
A minimal example would be:
from pathlib import Path
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
file_name = Path("...")
with open(file_name, "rb") as fp:
    # Build the parser before entering ``try`` so that the ``finally`` clause
    # can always close it; binding it inside ``try`` would turn any failure
    # during construction into a NameError raised from ``finally``.
    parser = PDFParser(fp)
    try:
        document = PDFDocument(parser)
        # Each outline entry unpacks into (level, title, dest, a, se).
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            ...  # do something
    except PDFNoOutlines:
        print("No outlines found.")
    except PDFSyntaxError:
        print("Corrupted PDF or non-PDF file.")
    finally:
        parser.close()
But what do the different fields of each outline entry mean? To answer this question we can refer to the section 12.3.3 Document Outline of the PDF Reference:
- Level (`int`): This is, unsurprisingly, the level at which the entry is. Entries at the top level have level `1`. Entries nested within those (i.e., their children) have level `2`, and so on.
- Title (`str`): Again, quite self-explanatory, this field contains the name of the entry. For example: “1. Introduction”.
- Dest (`Union[list, bytes]`, optional): This is where things start to get interesting. The first thing to mention is that if a Dest entry is present, the A entry shall not be present. Both of them specify the object the entry targets (this could be a page or any other object). Destinations can be specified in multiple ways. Rather than paraphrasing the PDF Reference, we refer the reader to chapter 12.3.2 Destinations for more information on the topic.
- A (`pdfminer.pdftypes.PDFObjRef`, optional): As an alternative to a destination, the target of an entry can also be specified as an action. Again, actions can get somewhat complicated, so we refer the reader to chapter 12.6 Actions of the reference.
- SE (`pdfminer.pdftypes.PDFObjRef`, optional): This field contains the structure element the entry points at. More information about structure elements can be found in chapter 14.7.2 Structure Hierarchy. It is worth mentioning that most PDFs do not include this field and use Dest or A instead; those that do might still include a destination (Dest) to stay compatible with PDF versions prior to 1.3.
Unfortunately, pdfminer.six doesn’t expose the page number that each of the entries targets. However, once we know what each of the fields above means, we can implement a ToC-entry page number resolver ourselves:
from enum import Enum, auto
from pathlib import Path
from typing import Any, Optional
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef
class PDFRefType(Enum):
    """Kinds of PDF reference distinguished by the page number resolver."""

    PDF_OBJ_REF = 1  # a PDFObjRef instance
    DICTIONARY = 2  # a dict that has a "D" key
    LIST = 3  # a list holding at least one PDFObjRef
    NAMED_REF = 4  # a bytes name
    UNK = 5  # fallback for anything else
class RefPageNumberResolver:
    """PDF Reference to page number resolver.

    .. note::
        Remote Go-To Actions (see 12.6.4.3 in
        `https://www.adobe.com/go/pdfreference/`__)
        are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from the object id of a page to its page number
            (pages are numbered from 1).
    """

    def __init__(self, document: PDFDocument):
        self.document = document
        # page object id -> page number, counting from 1 in the order
        # PDFPage.create_pages() yields the pages
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Get the type of a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The category of ``ref``;
            :obj:`PDFRefType.UNK` when it matches none of the known shapes.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a resolved object is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The resolved PDF object.

        Returns:
            :obj:`bool`: :obj:`True` if the object is a dictionary whose
            ``Type`` entry is the page literal, :obj:`False` otherwise.
        """
        return isinstance(ref, dict) and ref.get("Type") is LITERAL_PAGE

    def resolve(self, ref: Any, _visited: Optional[set] = None) -> Optional[int]:
        """Resolve a PDF reference to a page number recursively.

        Args:
            ref (:obj:`Any`):
                The PDF reference.
            _visited (:obj:`Optional[set]`):
                Internal bookkeeping for the recursion (object ids and
                destination names already followed). Callers should not
                pass it.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions, malformed references or reference cycles).
        """
        if _visited is None:
            _visited = set()

        ref_type = self.get_ref_type(ref)

        if ref_type is PDFRefType.PDF_OBJ_REF:
            # A malformed document may contain a chain of references that
            # loops back on itself; stop instead of recursing forever.
            if ref.objid in _visited:
                return None
            _visited.add(ref.objid)
            # Resolve the indirect object once and reuse the result.
            target = ref.resolve()
            if self.is_ref_page(target):
                return self.objid_to_pagenum.get(ref.objid)
            return self.resolve(target, _visited)

        if ref_type is PDFRefType.DICTIONARY:
            return self.resolve(ref["D"], _visited)

        if ref_type is PDFRefType.LIST:
            # Get the first PDFObjRef in the list; get_ref_type() guarantees
            # that at least one exists.
            first_ref = next(e for e in ref if isinstance(e, PDFObjRef))
            return self.resolve(first_ref, _visited)

        if ref_type is PDFRefType.NAMED_REF:
            # A name that leads back to itself would also loop forever.
            if ref in _visited:
                return None
            _visited.add(ref)
            return self.resolve(self.document.get_dest(ref), _visited)

        return None  # PDFRefType.UNK
The class PDFRefType is just a helper to categorize the type of
reference we are dealing with. Because a reference can point to
another reference, in some cases we have to call
RefPageNumberResolver.resolve() recursively until we finally reach a page object.
Then, we can get the page number by looking it up in the dictionary
RefPageNumberResolver.objid_to_pagenum, which maps the page object id to
the page number.
Using this page number resolver, we can for example print the Table of Contents of a document in a human-readable format with the following code:
def print_outlines(file: str) -> None:
    """Pretty print the outlines (ToC) of a PDF document.

    Each entry is printed on its own line, indented by four spaces per
    nesting level and followed by filling dots and the target page number.
    Entries whose target cannot be resolved to a page show ``?`` instead
    of a number.

    Args:
        file (:obj:`str`):
            Path of the PDF file to read.
    """
    with open(file, "rb") as fp:
        # Create the parser before ``try`` so ``finally`` can always close
        # it, without having to guard against a NameError.
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser)
            ref_pagenum_resolver = RefPageNumberResolver(document)
            outlines = list(document.get_outlines())
            if not outlines:
                print("No outlines found.")
            for (level, title, dest, a, se) in outlines:
                # Use the first target that is present: destination, then
                # action, then structure element.
                target = dest or a or se
                page_num = ref_pagenum_resolver.resolve(target) if target else None
                # None does not support the ">3" format spec (TypeError),
                # so turn the page number into a printable label first.
                page_label = "?" if page_num is None else str(page_num)
                # Calculate leading spaces and filling dots for formatting;
                # a negative dot count simply yields no dots.
                leading_spaces = (level - 1) * 4
                fill_dots = 80 - len(title) - leading_spaces
                print(
                    f"{' ' * leading_spaces}{title}",
                    "." * fill_dots,
                    f"{page_label:>3}",
                )
        except PDFNoOutlines:
            print("No outlines found.")
        except PDFSyntaxError:
            print("Corrupted PDF or non-PDF file.")
        finally:
            parser.close()
def main() -> None:
    """Print the outlines of the PDF found at the hard-coded path."""
    pdf_path = Path("...")
    print_outlines(pdf_path)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()