I am trying to load a file from an S3 bucket inside an AWS Lambda function using LangChain document loaders.
I first tried using S3FileLoader, but it failed with a "Read-only file system" error.
So I tried downloading the .docx file from the S3 bucket first and then loading it with the format-specific loader "UnstructuredWordDocumentLoader", since the file I uploaded was a Word document, but it still gave the same error.
Eventually, I want to load any type of document from the S3 bucket and generate embeddings to store in an OpenSearch vector database.
Also, if I try to deploy my Lambda function with Docker using the image public.ecr.aws/lambda/python:3.11,
I get the error "FileNotFoundError: soffice command was not found. Please install libreoffice".
Example Code
`
from langchain_community.document_loaders import S3FileLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
#first approach
#loader = S3FileLoader(s3_bucket, s3_key)
#docs = loader.load()
#second approach
s3.download_file(s3_bucket, s3_key, f"/tmp/{s3_key}")
with open(f"/tmp/{s3_key}", "rb") as f:
loader = UnstructuredWordDocumentLoader(f"/tmp/{s3_key}")
docs = loader.load()
`
[ERROR] OSError: [Errno 30] Read-only file system: '/home/sbx_user1051'
Traceback (most recent call last):
File "/var/task/aws_lambda_powertools/logging/logger.py", line 449, in decorate
return lambda_handler(event, context, *args, **kwargs)
File "/var/task/lambda_handler.py", line 54, in handler
docs = loader.load()
File "/var/task/langchain_community/document_loaders/unstructured.py", line 87, in load
elements = self._get_elements()
File "/var/task/langchain_community/document_loaders/word_document.py", line 124, in _get_elements
return partition_docx(filename=self.file_path, **self.unstructured_kwargs)
File "/var/task/unstructured/documents/elements.py", line 526, in wrapper
elements = func(*args, **kwargs)
File "/var/task/unstructured/file_utils/filetype.py", line 619, in wrapper
elements = func(*args, **kwargs)
File "/var/task/unstructured/file_utils/filetype.py", line 574, in wrapper
elements = func(*args, **kwargs)
File "/var/task/unstructured/chunking/__init__.py", line 69, in wrapper
elements = func(*args, **kwargs)
File "/var/task/unstructured/partition/docx.py", line 228, in partition_docx
return list(elements)
File "/var/task/unstructured/partition/lang.py", line 397, in apply_lang_metadata
elements = list(elements)
File "/var/task/unstructured/partition/docx.py", line 305, in _iter_document_elements
yield from self._iter_paragraph_elements(block_item)
File "/var/task/unstructured/partition/docx.py", line 541, in _iter_paragraph_elements
yield from self._classify_paragraph_to_element(item)
File "/var/task/unstructured/partition/docx.py", line 361, in _classify_paragraph_to_element
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
File "/var/task/unstructured/partition/docx.py", line 868, in _parse_paragraph_text_for_element_type
if is_possible_narrative_text(text):
File "/var/task/unstructured/partition/text_type.py", line 78, in is_possible_narrative_text
if exceeds_cap_ratio(text, threshold=cap_threshold):
File "/var/task/unstructured/partition/text_type.py", line 274, in exceeds_cap_ratio
if sentence_count(text, 3) > 1:
File "/var/task/unstructured/partition/text_type.py", line 223, in sentence_count
sentences = sent_tokenize(text)
File "/var/task/unstructured/nlp/tokenize.py", line 29, in sent_tokenize
_download_nltk_package_if_not_present(package_category="tokenizers", package_name="punkt")
File "/var/task/unstructured/nlp/tokenize.py", line 23, in _download_nltk_package_if_not_present
nltk.download(package_name)
File "/var/task/nltk/downloader.py", line 777, in download
for msg in self.incr_download(info_or_id, download_dir, force):
File "/var/task/nltk/downloader.py", line 642, in incr_download
yield from self._download_package(info, download_dir, force)
File "/var/task/nltk/downloader.py", line 699, in _download_package
os.makedirs(download_dir)
File "<frozen os>", line 215, in makedirs
File "<frozen os>", line 225, in makedirs