Pix2Text

Classes

`Pix2Text`

Methods

`__init__(self, *, layout_parser=None, text_formula_ocr=None, table_ocr=None, **kwargs)` `special`

Initialize the Pix2Text object.

Parameters:

Name	Type	Description	Default
`layout_parser`	`LayoutParser`	The layout parser object; default value is `None`, which means to create a default one	`None`
`text_formula_ocr`	`TextFormulaOCR`	The text and formula OCR object; default value is `None`, which means to create a default one	`None`
`table_ocr`	`TableOCR`	The table OCR object; default value is `None`, which means not to recognize tables	`None`
`**kwargs`	`dict`	Other arguments, currently not used	`{}`

Source code in pix2text/pix_to_text.py

def __init__(
    self,
    *,
    layout_parser: Optional[LayoutParser] = None,
    text_formula_ocr: Optional[TextFormulaOCR] = None,
    table_ocr: Optional[TableOCR] = None,
    **kwargs,
):
    """
    Assemble a Pix2Text object from its (optional) components.

    Args:
        layout_parser (LayoutParser): The layout parser to use; when `None`, a default
            `DocXLayoutParser` is built with `DocXLayoutParser.from_config(None, ...)`
        text_formula_ocr (TextFormulaOCR): The text and formula OCR to use; when `None`, a default one
            is built with `TextFormulaOCR.from_config(None, enable_formula=True, ...)`
        table_ocr (TableOCR): The table OCR to use; stored as given, so `None` means that
            no table recognizer is available
        **kwargs (dict): Accepted for compatibility; not used here
    """
    parser = layout_parser
    if parser is None:
        # The default parser is a DocXLayoutParser (not LayoutParser), on an auto-selected device.
        parser = DocXLayoutParser.from_config(None, device=select_device(None))

    ocr = text_formula_ocr
    if ocr is None:
        ocr = TextFormulaOCR.from_config(
            None, enable_formula=True, device=select_device(None)
        )

    self.layout_parser, self.text_formula_ocr, self.table_ocr = parser, ocr, table_ocr

`from_config(total_configs=None, enable_formula=True, enable_table=True, device=None, **kwargs)` `classmethod`

Create a Pix2Text object from the configuration.

Parameters:

Name	Type	Description	Default
`total_configs`	`dict`	The total configuration; default value is `None`, which means to use the default configuration. If not None, it should contain the following keys: * `layout`: The layout parser configuration * `text_formula`: The TextFormulaOCR configuration * `table`: The table OCR configuration	`None`
`enable_formula`	`bool`	Whether to enable formula recognition; default value is `True`	`True`
`enable_table`	`bool`	Whether to enable table recognition; default value is `True`	`True`
`device`	`str`	The device to run the model; optional values are 'cpu', 'gpu' or 'cuda'; default value is `None`, which means to select the device automatically	`None`
`**kwargs`	`dict`	Other arguments	`{}`

Source code in pix2text/pix_to_text.py

@classmethod
def from_config(
    cls,
    total_configs: Optional[dict] = None,
    enable_formula: bool = True,
    enable_table: bool = True,
    device: str = None,
    **kwargs,
):
    """
    Build a Pix2Text object and all of its components from one configuration dict.

    Args:
        total_configs (dict): Configuration for every component; `None` (or an empty dict) means
            that each component receives `None` as its configuration. Recognized keys:

                * `layout`: passed to the layout parser
                * `text_formula`: passed to the TextFormulaOCR
                * `table`: passed to the table OCR
        enable_formula (bool): Forwarded to `TextFormulaOCR.from_config`; default value is `True`
        enable_table (bool): Whether to create a table OCR; when `False` the resulting object
            has `table_ocr=None`; default value is `True`
        device (str): Forwarded unchanged to every component's `from_config`; default value is `None`
        **kwargs (dict): Extra keyword arguments forwarded to the class constructor

    Returns: a Pix2Text object

    """
    configs = total_configs or {}
    layout_cfg, text_formula_cfg, table_cfg = (
        configs.get(key) for key in ('layout', 'text_formula', 'table')
    )

    # A DocXLayoutParser is used here rather than LayoutParser.
    parser = DocXLayoutParser.from_config(layout_cfg, device=device)
    ocr = TextFormulaOCR.from_config(
        text_formula_cfg, enable_formula=enable_formula, device=device
    )

    # The table OCR reuses the text recognizer and spellchecker owned by the text/formula OCR.
    table_recognizer = None
    if enable_table:
        table_recognizer = TableOCR.from_config(
            ocr.text_ocr, ocr.spellchecker, table_cfg, device=device,
        )

    return cls(
        layout_parser=parser,
        text_formula_ocr=ocr,
        table_ocr=table_recognizer,
        **kwargs,
    )

`recognize(self, img, file_type='text_formula', **kwargs)`

Recognize the content of the image or pdf file according to the specified type. It will call the corresponding recognition function `.recognize_{file_type}()` according to the `file_type`.

Parameters:

Name	Type	Description	Default
`img`	`Union[str, Path, Image.Image]`	The image/pdf file path or `Image.Image` object	required
`file_type`	`str`	Supported file types: 'pdf', 'page', 'text_formula', 'formula', 'text'	`'text_formula'`
`**kwargs`	`dict`	Arguments for the corresponding recognition function	`{}`

Source code in pix2text/pix_to_text.py

def recognize(
    self,
    img: Union[str, Path, Image.Image],
    file_type: Literal[
        'pdf', 'page', 'text_formula', 'formula', 'text'
    ] = 'text_formula',
    **kwargs,
) -> Union[Document, Page, str, List[str], List[Any], List[List[Any]]]:
    """
    Dispatch to the recognition method that matches `file_type`.

    The method `.recognize_{file_type}()` of this object is looked up by name and called
    with `img` and the remaining keyword arguments.

    Args:
        img (Union[str, Path, Image.Image]): The image/pdf file path or `Image.Image` object
        file_type (str): Supported file types: 'pdf', 'page', 'text_formula', 'formula', 'text'
        **kwargs (dict): Arguments for the corresponding recognition function

    Returns: whatever the selected recognition function returns

    Raises:
        ValueError: if this object has no `recognize_{file_type}` attribute

    """
    handler_name = f'recognize_{file_type}'
    handler = getattr(self, handler_name, None)
    if handler is not None:
        return handler(img, **kwargs)
    raise ValueError(f'Unsupported file type: {file_type}')

`recognize_formula(self, imgs, batch_size=1, return_text=True, rec_config=None, **kwargs)`

Recognize pure Math Formula images to LaTeX Expressions

Parameters:

Name	Type	Description	Default
`imgs`	`Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]]`	The image or list of images	required
`batch_size`	`int`	The batch size	`1`
`return_text`	`bool`	Whether to return only the recognized text; default value is `True`	`True`
`rec_config`	`Optional[dict]`	The config for recognition	`None`
`**kwargs`		Special model parameters. Not used for now	`{}`

The LaTeX Expression or list of LaTeX Expressions;

str or List[str] when return_text is True; Dict[str, Any] or List[Dict[str, Any]] when return_text is False, with the following keys:

* `text`: The recognized LaTeX text
* `score`: The confidence score [0, 1]; the higher, the more confident

Source code in pix2text/pix_to_text.py

def recognize_formula(
    self,
    imgs: Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]],
    batch_size: int = 1,
    return_text: bool = True,
    rec_config: Optional[dict] = None,
    **kwargs,
) -> Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]]:
    """
    Convert pure math-formula image(s) to LaTeX by delegating to `self.text_formula_ocr`.

    Args:
        imgs (Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]]): The image or list of images
        batch_size (int): The batch size
        return_text (bool): Whether to return only the recognized text; default value is `True`
        rec_config (Optional[dict]): The config for recognition
        **kwargs (): Extra keyword arguments, forwarded unchanged to the underlying recognizer

    Returns: The LaTeX Expression or list of LaTeX Expressions;
        str or List[str] when `return_text` is True;
        Dict[str, Any] or List[Dict[str, Any]] when `return_text` is False, with the following keys:

            * `text`: The recognized LaTeX text
            * `score`: The confidence score [0, 1]; the higher, the more confident

    """
    # The four leading arguments are passed by position, in this exact order.
    formula_recognizer = self.text_formula_ocr.recognize_formula
    return formula_recognizer(imgs, batch_size, return_text, rec_config, **kwargs)

`recognize_page(self, img, page_number=0, page_id=None, **kwargs)`

Analyze the layout of the image, and then recognize the information contained in each section.

Parameters:

Name	Type	Description	Default
`img`	`str or Image.Image`	an image path, or `Image.Image` loaded by `Image.open()`	required
`page_number`	`int`	page number; default value is `0`	`0`
`page_id`	`str`	page id; default value is `None`, which means to use the `str(page_number)`	`None`
`kwargs`		resized_shape (int): Resize the image width to this size for processing; default value is `768` mfr_batch_size (int): batch size for MFR; When running on GPU, this value is suggested to be set to greater than 1; default value is `1` embed_sep (tuple): Prefix and suffix for embedding latex; only effective when `return_text` is `True`; default value is `(' $', '$ ')` isolated_sep (tuple): Prefix and suffix for isolated latex; only effective when `return_text` is `True`; default value is two-dollar signs line_sep (str): The separator between lines of text; only effective when `return_text` is `True`; default value is a line break auto_line_break (bool): Automatically line break the recognized text; only effective when `return_text` is `True`; default value is `True` det_text_bbox_max_width_expand_ratio (float): Expand the width of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.3` det_text_bbox_max_height_expand_ratio (float): Expand the height of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.2` embed_ratio_threshold (float): The overlap threshold for embed formulas and text lines; default value is `0.6`. When the overlap between an embed formula and a text line is greater than or equal to this threshold, the embed formula and the text line are considered to be on the same line; otherwise, they are considered to be on different lines. table_as_image (bool): If `True`, the table will be recognized as an image (don't parse the table content as text) ; default value is `False` title_contain_formula (bool): If `True`, the title of the page will be recognized as a mixed image (text and formula). 
If `False`, it will be recognized as a text; default value is `False` text_contain_formula (bool): If `True`, the text of the page will be recognized as a mixed image (text and formula). If `False`, it will be recognized as a text; default value is `True` formula_rec_kwargs (dict): generation arguments passed to formula recognizer `latex_ocr`; default value is `{}` save_debug_res (str): if `save_debug_res` is set, the directory to save the debug results; default value is `None`, which means not to save	`{}`

Source code in pix2text/pix_to_text.py

def recognize_page(
    self,
    img: Union[str, Path, Image.Image],
    page_number: int = 0,
    page_id: Optional[str] = None,
    **kwargs,
) -> Page:
    """
    Analyze the layout of the image, and then recognize the information contained in each section.

    Regions labelled as tables are sent to `self.table_ocr`. When no table OCR is configured
    (`self.table_ocr is None`), such regions are kept as figure elements with empty text
    instead of being parsed.

    Args:
        img (str or Image.Image): an image path, or `Image.Image` loaded by `Image.open()`
        page_number (int): page number; default value is `0`
        page_id (str): page id; default value is `None`, which means to use the `str(page_number)`
        kwargs ():
            * resized_shape (int): Resize the image width to this size for processing; default value is `768`
            * mfr_batch_size (int): batch size for MFR; When running on GPU, this value is suggested to be set to greater than 1; default value is `1`
            * embed_sep (tuple): Prefix and suffix for embedding latex; only effective when `return_text` is `True`; default value is `(' $', '$ ')`
            * isolated_sep (tuple): Prefix and suffix for isolated latex; only effective when `return_text` is `True`; default value is two-dollar signs
            * line_sep (str): The separator between lines of text; only effective when `return_text` is `True`; default value is a line break
            * auto_line_break (bool): Automatically line break the recognized text; only effective when `return_text` is `True`; default value is `True`
            * det_text_bbox_max_width_expand_ratio (float): Expand the width of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.3`
            * det_text_bbox_max_height_expand_ratio (float): Expand the height of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.2`
            * embed_ratio_threshold (float): The overlap threshold for embed formulas and text lines; default value is `0.6`.
                When the overlap between an embed formula and a text line is greater than or equal to this threshold,
                the embed formula and the text line are considered to be on the same line;
                otherwise, they are considered to be on different lines.
            * table_as_image (bool): If `True`, the table will be recognized as an image (don't parse the table content as text) ; default value is `False`
            * title_contain_formula (bool): If `True`, the title of the page will be recognized as a mixed image (text and formula). If `False`, it will be recognized as a text; default value is `False`
            * text_contain_formula (bool): If `True`, the text of the page will be recognized as a mixed image (text and formula). If `False`, it will be recognized as a text; default value is `True`
            * formula_rec_kwargs (dict): generation arguments passed to formula recognizer `latex_ocr`; default value is `{}`
            * save_debug_res (str): if `save_debug_res` is set, the directory to save the debug results; default value is `None`, which means not to save

    Returns: a Page object. Use `page.to_markdown('output-dir')` to get the markdown output of the recognized page.
    """
    if isinstance(img, Image.Image):
        img0 = img.convert('RGB')
    else:
        img0 = read_img(img, return_type='Image')

    page_id = page_id or str(page_number)

    # Fill in defaults once; the same kwargs dict is later handed to every recognizer,
    # to every Element and to the Page.
    kwargs.setdefault('embed_sep', (' $', '$ '))
    kwargs.setdefault('isolated_sep', ('$$\n', '\n$$'))
    kwargs.setdefault('line_sep', '\n')
    kwargs.setdefault('auto_line_break', True)
    kwargs.setdefault('title_contain_formula', False)
    kwargs.setdefault('text_contain_formula', True)
    resized_shape = kwargs.setdefault('resized_shape', 768)

    # `table_as_image` is defaulted for the layout parser only, not in the shared kwargs.
    layout_kwargs = deepcopy(kwargs)
    layout_kwargs['table_as_image'] = kwargs.get('table_as_image', False)
    layout_out, column_meta = self.layout_parser.parse(
        img0.copy(), **layout_kwargs,
    )

    debug_dir = None
    if kwargs.get('save_debug_res', None):
        debug_dir = Path(kwargs.get('save_debug_res'))
        debug_dir.mkdir(exist_ok=True, parents=True)

    outs = []
    for _id, box_info in enumerate(layout_out):
        image_type = box_info['type']
        if image_type == ElementType.IGNORED:
            continue
        box = box2list(box_info['position'])
        crop_patch = img0.crop(box)
        crop_width, _ = crop_patch.size
        score = 1.0
        if image_type in (ElementType.TEXT, ElementType.TITLE):
            # Wide crops get a larger working width: grow by 1.5x per step, capped at 2048.
            _resized_shape = resized_shape
            while crop_width > 1.5 * _resized_shape and _resized_shape < 2048:
                _resized_shape = min(int(1.5 * _resized_shape), 2048)
            padding_patch = add_img_margin(
                crop_patch, left_right_margin=30, top_bottom_margin=30
            )
            text_formula_kwargs = deepcopy(kwargs)
            text_formula_kwargs['resized_shape'] = _resized_shape
            text_formula_kwargs['save_analysis_res'] = (
                debug_dir / f'{_id}-{image_type.name}.png' if debug_dir else None
            )
            if image_type == ElementType.TITLE:
                text_formula_kwargs['contain_formula'] = kwargs[
                    'title_contain_formula'
                ]
            if image_type == ElementType.TEXT:
                text_formula_kwargs['contain_formula'] = kwargs[
                    'text_contain_formula'
                ]
            # Structured output is required so that per-box scores can be averaged below.
            text_formula_kwargs['return_text'] = False
            _out = self.text_formula_ocr.recognize(
                padding_patch, **text_formula_kwargs,
            )
            text, meta = None, _out
            score = float(np.mean([x['score'] for x in _out]))
        elif image_type == ElementType.TABLE and self.table_ocr is None:
            # No table recognizer is configured (`table_ocr=None`), so calling it would
            # raise AttributeError. Keep the region as a figure, exactly like the FIGURE branch.
            image_type = ElementType.FIGURE
            text, meta = '', None
        elif image_type == ElementType.TABLE:
            # Enlarge the table box by a small margin, clamped to the image bounds.
            xmin, ymin, xmax, ymax = box
            img_width, img_height = img0.size
            table_expansion_margin = 10
            xmin, ymin = (
                max(0, xmin - table_expansion_margin),
                max(0, ymin - table_expansion_margin),
            )
            xmax, ymax = (
                min(img_width, xmax + table_expansion_margin),
                min(img_height, ymax + table_expansion_margin),
            )
            box = (xmin, ymin, xmax, ymax)
            crop_patch = img0.crop(box)
            save_analysis_res = (
                debug_dir / f'{_id}-{image_type.name}.png' if debug_dir else None
            )
            table_kwargs = deepcopy(kwargs)
            table_kwargs['save_analysis_res'] = save_analysis_res
            _out = self.table_ocr.recognize(
                crop_patch,
                out_cells=True,
                out_markdown=True,
                out_html=True,
                **table_kwargs,
            )
            text, meta = None, _out
        elif image_type == ElementType.FORMULA:
            formula_kwargs = deepcopy(kwargs)
            formula_kwargs['return_text'] = False
            _out = self.text_formula_ocr.recognize_formula(
                crop_patch, **formula_kwargs
            )
            score = _out['score']
            text, meta = None, _out
        elif image_type == ElementType.FIGURE:
            text, meta = '', None
        else:
            # Any other layout type is recorded as UNKNOWN with empty text.
            image_type = ElementType.UNKNOWN
            text, meta = '', None

        outs.append(
            Element(
                id=f'{page_id}-{_id}',
                box=box,
                meta=meta,
                text=text,
                isolated=box_info['isolated'],
                col_number=box_info['col_number'],
                type=image_type,
                score=score,
                total_img=img0,
                spellchecker=self.text_formula_ocr.spellchecker,
                configs=kwargs,
            )
        )

    remaining_blocks = self._parse_remaining(
        img0, layout_out, column_meta, debug_dir, **kwargs
    )
    for box_info in remaining_blocks:
        # `len(outs)` grows with each append, so every remaining block gets a distinct id.
        outs.append(
            Element(
                id=f'{page_id}-{len(outs)}-remaining',
                box=box2list(box_info['position']),
                meta=None,
                text=box_info['text'],
                isolated=False,
                col_number=box_info['col_number'],
                type=ElementType.TEXT
                if box_info['type'] != 'isolated'
                else ElementType.FORMULA,
                score=box_info['score'],
                total_img=img0,
                spellchecker=self.text_formula_ocr.spellchecker,
                configs=kwargs,
            )
        )
    return Page(
        number=page_number,
        id=page_id,
        elements=outs,
        spellchecker=self.text_formula_ocr.spellchecker,
        config=kwargs,
    )

`recognize_pdf(self, pdf_fp, pdf_number=0, pdf_id=None, page_numbers=None, **kwargs)`

recognize a pdf file

Parameters:

Name	Type	Description	Default
`pdf_fp`	`Union[str, Path]`	pdf file path	required
`pdf_number`	`int`	pdf number	`0`
`pdf_id`	`str`	pdf id	`None`
`page_numbers`	`List[int]`	page numbers to recognize; default is `None`, which means to recognize all pages	`None`
`kwargs`	`dict`	Optional keyword arguments. The same as `recognize_page`	`{}`

Source code in pix2text/pix_to_text.py

def recognize_pdf(
    self,
    pdf_fp: Union[str, Path],
    pdf_number: int = 0,
    pdf_id: Optional[str] = None,
    page_numbers: Optional[List[int]] = None,
    **kwargs,
) -> Document:
    """
    Recognize a pdf file page by page.

    Each selected page is rendered to an RGB image and passed to `recognize_page`.
    The pdf document is always closed before returning, including when recognition fails.

    Args:
        pdf_fp (Union[str, Path]): pdf file path
        pdf_number (int): pdf number
        pdf_id (str): pdf id; default is `None`, which means to use `str(pdf_number)`
        page_numbers (List[int]): 0-based page numbers to recognize; default is `None`, which means to recognize all pages.
            Numbers outside the document's page range are ignored
        kwargs (dict): Optional keyword arguments. The same as `recognize_page`.
            If `save_debug_res` is set, each page's debug output goes to the
            sub-directory `{pdf_id}-{page_id}` of that directory

    Returns: a Document object. Use `doc.to_markdown('output-dir')` to get the markdown output of the recognized document.

    """
    pdf_id = pdf_id or str(pdf_number)

    outs = []
    doc = fitz.open(pdf_fp, filetype='pdf')
    try:
        # A set gives O(1) membership checks; `None` means "every page".
        wanted_pages = None if page_numbers is None else set(page_numbers)
        for page_num in range(len(doc)):
            if wanted_pages is not None and page_num not in wanted_pages:
                continue
            page = doc.load_page(page_num)
            # convert to image
            pix = page.get_pixmap(dpi=300)
            # convert the pixmap to bytes
            # NOTE(review): JPEG quality is normally in the range 0-100; confirm that 200 is intended.
            img_data = pix.tobytes(output='jpg', jpg_quality=200)
            # Create a PIL Image from the raw image data
            image = Image.open(io.BytesIO(img_data)).convert('RGB')
            page_id = str(page_num)
            page_kwargs = deepcopy(kwargs)
            if kwargs.get('save_debug_res'):
                page_kwargs['save_debug_res'] = os.path.join(
                    kwargs['save_debug_res'], f'{pdf_id}-{page_id}'
                )
            outs.append(
                self.recognize_page(
                    image, page_number=page_num, page_id=page_id, **page_kwargs
                )
            )
    finally:
        # The page images were converted to RGB PIL images above,
        # so the pdf handle can be released here.
        doc.close()

    return Document(
        number=pdf_number,
        id=pdf_id,
        pages=outs,
        spellchecker=self.text_formula_ocr.spellchecker,
        config=kwargs,
    )

`recognize_text(self, imgs, return_text=True, rec_config=None, **kwargs)`

Recognize a pure Text Image.

Parameters:

Name	Type	Description	Default
`imgs`	`Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]]`	The image or list of images	required
`return_text`	`bool`	Whether to return only the recognized text; default value is `True`	`True`
`rec_config`	`Optional[dict]`	The config for recognition	`None`
`kwargs`		Other parameters for `text_ocr.ocr()`	`{}`

Text str or list of text strs when return_text is True;

List[Any] or List[List[Any]] when return_text is False, with the same length as imgs and the following keys:

* `position`: Position information of the block, `np.ndarray`, with a shape of [4, 2]
* `text`: The recognized text
* `score`: The confidence score [0, 1]; the higher, the more confident

Source code in pix2text/pix_to_text.py

def recognize_text(
    self,
    imgs: Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]],
    return_text: bool = True,
    rec_config: Optional[dict] = None,
    **kwargs,
) -> Union[str, List[str], List[Any], List[List[Any]]]:
    """
    Recognize pure text image(s) by delegating to `self.text_formula_ocr`.

    Args:
        imgs (Union[str, Path, Image.Image, List[str], List[Path], List[Image.Image]]): The image or list of images
        return_text (bool): Whether to return only the recognized text; default value is `True`
        rec_config (Optional[dict]): The config for recognition
        kwargs (): Other parameters for `text_ocr.ocr()`

    Returns: Text str or list of text strs when `return_text` is True;
        `List[Any]` or `List[List[Any]]` when `return_text` is False, with the same length as `imgs` and the following keys:

            * `position`: Position information of the block, `np.ndarray`, with a shape of [4, 2]
            * `text`: The recognized text
            * `score`: The confidence score [0, 1]; the higher, the more confident

    """
    # The three leading arguments are passed by position, in this exact order.
    text_recognizer = self.text_formula_ocr.recognize_text
    return text_recognizer(imgs, return_text, rec_config, **kwargs)

`recognize_text_formula(self, img, return_text=True, **kwargs)`

Analyze the layout of the image, and then recognize the information contained in each section.

Parameters:

Name	Type	Description	Default
`img`	`str or Image.Image`	an image path, or `Image.Image` loaded by `Image.open()`	required
`return_text`	`bool`	Whether to return the recognized text; default value is `True`	`True`
`kwargs`		resized_shape (int): Resize the image width to this size for processing; default value is `768` save_analysis_res (str): Save the mfd result image in this file; default is `None`, which means not to save mfr_batch_size (int): batch size for MFR; When running on GPU, this value is suggested to be set to greater than 1; default value is `1` embed_sep (tuple): Prefix and suffix for embedding latex; only effective when `return_text` is `True`; default value is `(' $', '$ ')` isolated_sep (tuple): Prefix and suffix for isolated latex; only effective when `return_text` is `True`; default value is two-dollar signs line_sep (str): The separator between lines of text; only effective when `return_text` is `True`; default value is a line break auto_line_break (bool): Automatically line break the recognized text; only effective when `return_text` is `True`; default value is `True` det_text_bbox_max_width_expand_ratio (float): Expand the width of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.3` det_text_bbox_max_height_expand_ratio (float): Expand the height of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.2` embed_ratio_threshold (float): The overlap threshold for embed formulas and text lines; default value is `0.6`. When the overlap between an embed formula and a text line is greater than or equal to this threshold, the embed formula and the text line are considered to be on the same line; otherwise, they are considered to be on different lines. table_as_image (bool): If `True`, the table will be recognized as an image; default value is `False` formula_rec_kwargs (dict): generation arguments passed to formula recognizer `latex_ocr`; default value is `{}`	`{}`

a str when return_text is True; or a list of ordered (top to bottom, left to right) dicts when return_text is False,

with each dict representing one detected box, containing keys:

type: The category of the image; Optional: 'text', 'isolated', 'embedding'
text: The recognized text or Latex formula
score: The confidence score [0, 1]; the higher, the more confident
position: Position information of the block, np.ndarray, with shape of [4, 2]
line_number: The line number of the box (first line line_number==0), boxes with the same value indicate they are on the same line

Source code in pix2text/pix_to_text.py

def recognize_text_formula(
    self, img: Union[str, Path, Image.Image], return_text: bool = True, **kwargs,
) -> Union[str, List[str], List[Any], List[List[Any]]]:
    """
    Recognize an image mixing text and formulas by delegating to `self.text_formula_ocr.recognize`.

    Args:
        img (str or Image.Image): an image path, or `Image.Image` loaded by `Image.open()`
        return_text (bool): Whether to return the recognized text; default value is `True`
        kwargs ():
            * resized_shape (int): Resize the image width to this size for processing; default value is `768`
            * save_analysis_res (str): Save the mfd result image in this file; default is `None`, which means not to save
            * mfr_batch_size (int): batch size for MFR; When running on GPU, this value is suggested to be set to greater than 1; default value is `1`
            * embed_sep (tuple): Prefix and suffix for embedding latex; only effective when `return_text` is `True`; default value is `(' $', '$ ')`
            * isolated_sep (tuple): Prefix and suffix for isolated latex; only effective when `return_text` is `True`; default value is two-dollar signs
            * line_sep (str): The separator between lines of text; only effective when `return_text` is `True`; default value is a line break
            * auto_line_break (bool): Automatically line break the recognized text; only effective when `return_text` is `True`; default value is `True`
            * det_text_bbox_max_width_expand_ratio (float): Expand the width of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.3`
            * det_text_bbox_max_height_expand_ratio (float): Expand the height of the detected text bbox. This value represents the maximum expansion ratio above and below relative to the original bbox height; default value is `0.2`
            * embed_ratio_threshold (float): The overlap threshold for embed formulas and text lines; default value is `0.6`.
                When the overlap between an embed formula and a text line is greater than or equal to this threshold,
                the embed formula and the text line are considered to be on the same line;
                otherwise, they are considered to be on different lines.
            * table_as_image (bool): If `True`, the table will be recognized as an image; default value is `False`
            * formula_rec_kwargs (dict): generation arguments passed to formula recognizer `latex_ocr`; default value is `{}`

    Returns: a str when `return_text` is `True`; or a list of ordered (top to bottom, left to right) dicts when `return_text` is `False`,
        with each dict representing one detected box, containing keys:

           * `type`: The category of the image; one of 'text', 'isolated', 'embedding'
           * `text`: The recognized text or Latex formula
           * `score`: The confidence score [0, 1]; the higher, the more confident
           * `position`: Position information of the block, `np.ndarray`, with shape of [4, 2]
           * `line_number`: The line number of the box (first line `line_number==0`), boxes with the same value indicate they are on the same line

    """
    # `return_text` is passed by position, right after the image.
    mixed_recognizer = self.text_formula_ocr.recognize
    return mixed_recognizer(img, return_text, **kwargs)

Pix2Text

Classes

Pix2Text

Methods

__init__(self, *, layout_parser=None, text_formula_ocr=None, table_ocr=None, **kwargs) special

from_config(total_configs=None, enable_formula=True, enable_table=True, device=None, **kwargs) classmethod

recognize(self, img, file_type='text_formula', **kwargs)

recognize_formula(self, imgs, batch_size=1, return_text=True, rec_config=None, **kwargs)

recognize_page(self, img, page_number=0, page_id=None, **kwargs)

recognize_pdf(self, pdf_fp, pdf_number=0, pdf_id=None, page_numbers=None, **kwargs)

recognize_text(self, imgs, return_text=True, rec_config=None, **kwargs)

recognize_text_formula(self, img, return_text=True, **kwargs)

`Pix2Text`

`__init__(self, *, layout_parser=None, text_formula_ocr=None, table_ocr=None, **kwargs)` `special`

`from_config(total_configs=None, enable_formula=True, enable_table=True, device=None, **kwargs)` `classmethod`

`recognize(self, img, file_type='text_formula', **kwargs)`

`recognize_formula(self, imgs, batch_size=1, return_text=True, rec_config=None, **kwargs)`

`recognize_page(self, img, page_number=0, page_id=None, **kwargs)`

`recognize_pdf(self, pdf_fp, pdf_number=0, pdf_id=None, page_numbers=None, **kwargs)`

`recognize_text(self, imgs, return_text=True, rec_config=None, **kwargs)`

`recognize_text_formula(self, img, return_text=True, **kwargs)`