Skip to content

Docx reader

DocxReader

Bases: Reader

Docx reader.

Source code in agentuniverse/agent/action/knowledge/reader/file/docx_reader.py
Python
class DocxReader(Reader):
    """Reader that extracts the text of a docx file using `docx2txt`."""

    def load_data(self, file: Path, ext_info: Optional[Dict] = None) -> List[Document]:
        """Parse the docx file into a single document.

        Note:
            The docx file cannot be processed page by page; the whole
            extracted text is returned as one document.
            `docx2txt` is required to read DOCX files: `pip install docx2txt`

        Args:
            file: Path of the docx file to read.
            ext_info: Optional extra metadata merged over the default
                metadata; on a key conflict the value from `ext_info` wins.

        Returns:
            A one-element list holding a `Document` built from the extracted
            text and the merged metadata.

        Raises:
            ImportError: If `docx2txt` is not installed.
        """
        try:
            import docx2txt
        except ImportError as err:
            # Chain the original failure so the traceback keeps the root cause.
            raise ImportError(
                "docx2txt is required to read Microsoft Word files: "
                "`pip install docx2txt`"
            ) from err

        text = docx2txt.process(file)
        metadata = {"file_name": file.name}
        if ext_info is not None:
            metadata.update(ext_info)

        # metadata always contains at least "file_name", so no empty-dict
        # fallback is needed here.
        return [Document(text=text, metadata=metadata)]

load_data(file, ext_info=None)

Parse the docx file.

Note

The docx file cannot be processed page by page. docx2txt is required to read DOCX files: pip install docx2txt

Source code in agentuniverse/agent/action/knowledge/reader/file/docx_reader.py
Python
def load_data(self, file: Path, ext_info: Optional[Dict] = None) -> List[Document]:
    """Parse the docx file into a single document.

    Note:
        The docx file cannot be processed page by page; the whole
        extracted text is returned as one document.
        `docx2txt` is required to read DOCX files: `pip install docx2txt`

    Args:
        file: Path of the docx file to read.
        ext_info: Optional extra metadata merged over the default
            metadata; on a key conflict the value from `ext_info` wins.

    Returns:
        A one-element list holding a `Document` built from the extracted
        text and the merged metadata.

    Raises:
        ImportError: If `docx2txt` is not installed.
    """
    try:
        import docx2txt
    except ImportError as err:
        # Chain the original failure so the traceback keeps the root cause.
        raise ImportError(
            "docx2txt is required to read Microsoft Word files: "
            "`pip install docx2txt`"
        ) from err

    text = docx2txt.process(file)
    metadata = {"file_name": file.name}
    if ext_info is not None:
        metadata.update(ext_info)

    # metadata always contains at least "file_name", so no empty-dict
    # fallback is needed here.
    return [Document(text=text, metadata=metadata)]