Pdf reader
PdfReader
¶
Bases: Reader
PDF reader.
Source code in agentuniverse/agent/action/knowledge/reader/file/pdf_reader.py
Python
class PdfReader(Reader):
"""PDF reader."""
def load_data(self, file: Path, ext_info: Optional[Dict] = None) -> List[Document]:
"""Parse the pdf file.
Note:
`pypdf` is required to read PDF files: `pip install pypdf`
"""
try:
import pypdf
except ImportError:
raise ImportError(
"pypdf is required to read PDF files: `pip install pypdf`"
)
with open(file, "rb") as fp:
# Create a PDF object
pdf = pypdf.PdfReader(fp)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
# Iterate over every page
docs = []
for page in range(num_pages):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
page_label = pdf.page_labels[page]
metadata = {"page_label": page_label, "file_name": file.name}
if ext_info is not None:
metadata.update(ext_info)
docs.append(Document(text=page_text, metadata=metadata))
return docs
load_data(file, ext_info=None)
¶
Parse the pdf file.
Note
pypdf is required to read PDF files: pip install pypdf
Source code in agentuniverse/agent/action/knowledge/reader/file/pdf_reader.py
Python
def load_data(self, file: Path, ext_info: Optional[Dict] = None) -> List[Document]:
"""Parse the pdf file.
Note:
`pypdf` is required to read PDF files: `pip install pypdf`
"""
try:
import pypdf
except ImportError:
raise ImportError(
"pypdf is required to read PDF files: `pip install pypdf`"
)
with open(file, "rb") as fp:
# Create a PDF object
pdf = pypdf.PdfReader(fp)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
# Iterate over every page
docs = []
for page in range(num_pages):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
page_label = pdf.page_labels[page]
metadata = {"page_label": page_label, "file_name": file.name}
if ext_info is not None:
metadata.update(ext_info)
docs.append(Document(text=page_text, metadata=metadata))
return docs