pypdf._text_extraction 源代码

"""
Code related to text extraction.

Some parts are still in _page.py. In doubt, they will stay there.
"""

import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding

CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5


class OrientationNotFoundError(Exception):
    pass


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer or string, it will be converted to its ASCII code.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer or string, it will be converted to its ASCII code.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of ASCII codes.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS


[文档] def mult(m: List[float], n: List[float]) -> List[float]: return [ m[0] * n[0] + m[1] * n[2], m[0] * n[1] + m[1] * n[3], m[2] * n[0] + m[3] * n[2], m[2] * n[1] + m[3] * n[3], m[4] * n[0] + m[5] * n[2] + n[4], m[4] * n[1] + m[5] * n[3] + n[5], ]
def orient(m: List[float]) -> int: if m[3] > 1e-6: return 0 elif m[3] < -1e-6: return 180 elif m[1] > 0: return 90 else: return 270 def crlf_space_check( text: str, cmtm_prev: Tuple[List[float], List[float]], cmtm_matrix: Tuple[List[float], List[float]], memo_cmtm: Tuple[List[float], List[float]], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], orientations: Tuple[int, ...], output: str, font_size: float, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], str_widths: float, spacewidth: float, str_height: float, ) -> Tuple[str, str, List[float], List[float]]: cm_prev = cmtm_prev[0] tm_prev = cmtm_prev[1] cm_matrix = cmtm_matrix[0] tm_matrix = cmtm_matrix[1] memo_cm = memo_cmtm[0] memo_tm = memo_cmtm[1] m_prev = mult(tm_prev, cm_prev) m = mult(tm_matrix, cm_matrix) orientation = orient(m) delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] # Table 108 of the 1.7 reference ("Text positioning operators") scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) cm_prev = m if orientation not in orientations: raise OrientationNotFoundError if orientation in (0, 180): moved_height: float = delta_y moved_width: float = delta_x elif orientation in (90, 270): moved_height = delta_x moved_width = delta_y try: if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y): if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( text + "\n", memo_cm, memo_tm, cmap[3], font_size, ) text = "" elif ( (moved_width >= (spacewidth + str_widths) * scale_prev_x) and (output + text)[-1] != " " ): text += " " except Exception: pass tm_prev = tm_matrix.copy() cm_prev = cm_matrix.copy() return text, output, cm_prev, tm_prev def get_text_operands( operands: List[Union[str, TextStringObject]], cm_matrix: List[float], tm_matrix: List[float], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], orientations: Tuple[int, ...] ) -> Tuple[str, bool]: t: str = "" is_str_operands = False m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): t = operands[0] is_str_operands = True else: t = "" tt: bytes = ( encode_pdfdocencoding(operands[0]) if isinstance(operands[0], str) else operands[0] ) if isinstance(cmap[0], str): try: t = tt.decode(cmap[0], "surrogatepass") # apply str encoding except Exception: # the data does not match the expectation, # we use the alternative ; # text extraction may not be good t = tt.decode( "utf-16-be" if cmap[0] == "charmap" else "charmap", "surrogatepass", ) # apply str encoding else: # apply dict encoding t = "".join( [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] ) return (t, is_str_operands) def get_display_str( text: str, cm_matrix: List[float], tm_matrix: List[float], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], text_operands: str, font_size: float, rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] ) -> Tuple[str, bool]: # "\u0590 - \u08FF \uFB50 - \uFDFF" for x in [cmap[1].get(x, x) for x in text_operands]: # x can be a sequence of bytes ; ex: habibi.pdf if len(x) == 1: xx = ord(x) else: xx = 1 # fmt: off if ( # cases where the current inserting order is kept (xx <= 0x2F) # punctuations but... or 0x3A <= xx <= 0x40 # numbers (x30-39) or 0x2000 <= xx <= 0x206F # upper punctuations.. or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... ): text = x + text if rtl_dir else text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF or 0xFE70 <= xx <= 0xFEFF or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX ): if not rtl_dir: rtl_dir = True if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = x + text else: # left-to-right if rtl_dir: rtl_dir = False if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = text + x # fmt: on return text, rtl_dir