Pandas overflow error when reading large .txt file from S3

I have a large (>10 GB) .txt file on S3 that I'm trying to load into pandas. I have tried a couple of different approaches, but haven't been able to load/read the data successfully.

Python 3.8.12

As for memory, I am running this on an instance with 32 GiB of RAM, so I don't think it's a memory issue.

import io

import pandas as pd
import boto3

# AWS credentials
aws_id = 'xxxx'
aws_secret = 'xxxx'
Client = boto3.client(
    's3',
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret,
)

# Read data from S3
result = Client.get_object(Bucket="bucket-1", Key="file1.txt")
print(result)

# To Pandas DataFrame
df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)

Traceback:

---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
/tmp/ipykernel_4239/3915305936.py in <cell line: 2>()
      1 # To Pandas DataFrame
----> 2 df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)

~/anaconda3/envs/python3/lib/python3.8/site-packages/botocore/response.py in read(self, amt)
     97         """
     98         try:
---> 99             chunk = self._raw_stream.read(amt)
    100         except URLLib3ReadTimeoutError as e:
    101             # TODO: the url will be None as urllib3 isn't setting it yet

~/anaconda3/envs/python3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    513             if amt is None:
    514                 # cStringIO doesn't like amt=None
--> 515                 data = self._fp.read() if not fp_closed else b""
    516                 flush_decoder = True
    517             else:

~/anaconda3/envs/python3/lib/python3.8/http/client.py in read(self, amt)
    470             else:
    471                 try:
--> 472                     s = self._safe_read(self.length)
    473                 except IncompleteRead:
    474                     self._close_conn()

~/anaconda3/envs/python3/lib/python3.8/http/client.py in _safe_read(self, amt)
    611         IncompleteRead exception can be used to detect the problem.
    612         """
--> 613         data = self.fp.read(amt)
    614         if len(data) < amt:
    615             raise IncompleteRead(data, amt-len(data))

~/anaconda3/envs/python3/lib/python3.8/socket.py in readinto(self, b)
    667         while True:
    668             try:
--> 669                 return self._sock.recv_into(b)
    670             except timeout:
    671                 self._timeout_occurred = True

~/anaconda3/envs/python3/lib/python3.8/ssl.py in recv_into(self, buffer, nbytes, flags)
   1239                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1240                   self.__class__)
-> 1241             return self.read(nbytes, buffer)
   1242         else:
   1243             return super().recv_into(buffer, nbytes, flags)

~/anaconda3/envs/python3/lib/python3.8/ssl.py in read(self, len, buffer)
   1097         try:
   1098             if buffer is not None:
-> 1099                 return self._sslobj.read(len, buffer)
   1100             else:
   1101                 return self._sslobj.read(len)

OverflowError: signed integer is greater than maximum
Asked a year ago · 686 views
1 Answer

Using .read() loads the full object into memory in a single call; with a >10 GB body, that one read exceeds the maximum length Python's SSL layer can handle in a single call, which is what raises the OverflowError in your traceback. Avoid that by reading the stream incrementally, or better, use a recent version of pandas to read directly from the S3 location - http://pandas.pydata.org/pandas-docs/stable/user_guide/io.html - v1.4 and v1.5 support S3 locations.

For example,

import pandas as pd
import boto3

# AWS credentials (the client is only needed for the streaming
# alternative further below)
aws_id = 'xxxx'
aws_secret = 'xxxx'
Client = boto3.client(
    's3',
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret,
)

# To Pandas DataFrame (s3:// paths require the s3fs package;
# sep/dtype/header match the tab-separated file from the question)
df_loan = pd.read_csv('s3://bucket-1/file1.txt', sep="\t", dtype='object', header=None)
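
Note that pd.read_csv('s3://...') uses your default AWS credential chain rather than the boto3 client above. If you need to pass keys explicitly, recent pandas versions forward storage_options to s3fs; a minimal sketch, reusing the bucket and credentials from above ('key' and 'secret' are s3fs's parameter names):

import pandas as pd

# Sketch: hand explicit credentials to s3fs via storage_options
# instead of relying on the default credential chain.
df_loan = pd.read_csv(
    's3://bucket-1/file1.txt',
    sep="\t",
    dtype='object',
    header=None,
    storage_options={'key': aws_id, 'secret': aws_secret},
)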

If you are not using the latest version of pandas, try an alternative to read(). For example:

# Read data from S3
result = Client.get_object(Bucket="bucket-1", Key="file1.txt")
print(result)

# Stream the body line by line instead of reading it all at once
for i, line in enumerate(result['Body'].iter_lines()):
    line_decoded = line.decode('utf-8')
    # Do your processing part here
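
If you want DataFrames rather than raw lines, a middle ground is to let pandas consume the response in bounded chunks. botocore's StreamingBody is file-like, so read_csv can pull from it directly; a sketch, with an illustrative chunksize:

import pandas as pd

# Sketch: read the object in chunks of rows so no single call has to
# pull the whole >10 GB body into memory at once. chunksize is
# illustrative; tune it to your memory budget.
result = Client.get_object(Bucket="bucket-1", Key="file1.txt")

for chunk in pd.read_csv(result['Body'], sep="\t", dtype='object',
                         header=None, chunksize=100_000):
    # Each chunk is a regular DataFrame
    print(len(chunk))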
AWS
Answered a year ago
