Web pdf reader
WebPdfReader
Bases: Reader
The agentUniverse (aU) web PDF reader.
The PDF file is downloaded into memory and then parsed with pdfminer.six.
Source code in agentuniverse/agent/action/knowledge/reader/file/web_pdf_reader.py
Python
class WebPdfReader(Reader):
    """The agentUniverse(aU) web pdf reader.

    The pdf file is downloaded into memory and then parsed by `pdfminer.six`.
    """

    def load_data(self, web_pdf_url: str) -> List[Document]:
        """Download a PDF from a URL and return its text as a single document.

        Args:
            web_pdf_url: URL of the PDF file to download.

        Returns:
            A one-element list holding a Document whose text is the extracted
            content and whose metadata stores the URL under the "source" key.
            An empty list is returned when the URL is None or empty, or when
            the response status code is not 200.

        Raises:
            ImportError: If `pdfminer.six` is not installed.
        """
        if not web_pdf_url:
            return []

        response = requests.get(web_pdf_url)
        if response.status_code != 200:
            # Honour the declared return type: a failed download yields no
            # documents instead of implicitly returning None.
            return []

        # pdfminer.six is imported lazily so it is only required when a PDF
        # is actually parsed.
        try:
            from pdfminer.high_level import extract_text_to_fp
        except ImportError as exc:
            raise ImportError(
                "pdfminer.six is required to read PDF files: `pip install pdfminer.six`"
            ) from exc

        # Keep the downloaded bytes and the extracted text in memory buffers;
        # both are closed as soon as the text has been decoded.
        with BytesIO(response.content) as pdf_memory_file, BytesIO() as output_string:
            extract_text_to_fp(pdf_memory_file, output_string, output_type='text', codec='utf-8')
            text = output_string.getvalue().decode('utf-8')
        return [Document(text=text, metadata={"source": web_pdf_url})]