Pandas OverflowError when reading a large .txt file from S3

I have a large (>10 GB) .txt file on S3 that I am trying to load into pandas. I have tried a couple of different approaches, but haven't been able to load or read the data successfully.

Python 3.8.12

As for memory, I am running this on an instance with 32 GiB of RAM, so I don't think it's a memory issue.

import io
import pandas as pd
import boto3

# AWS credentials
aws_id = 'xxxx'
aws_secret = 'xxxx'
Client = boto3.client(
    's3',
    aws_access_key_id=aws_id,
    aws_secret_access_key=aws_secret
)


# Read data from S3
result = Client.get_object(Bucket="bucket-1", Key="file1.txt")
print(result)

# To Pandas DataFrame
df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)

Traceback:

---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
/tmp/ipykernel_4239/3915305936.py in <cell line: 2>()
      1 # To Pandas DataFrame
----> 2 df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)

~/anaconda3/envs/python3/lib/python3.8/site-packages/botocore/response.py in read(self, amt)
     97         """
     98         try:
---> 99             chunk = self._raw_stream.read(amt)
    100         except URLLib3ReadTimeoutError as e:
    101             # TODO: the url will be None as urllib3 isn't setting it yet

~/anaconda3/envs/python3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    513             if amt is None:
    514                 # cStringIO doesn't like amt=None
--> 515                 data = self._fp.read() if not fp_closed else b""
    516                 flush_decoder = True
    517             else:

~/anaconda3/envs/python3/lib/python3.8/http/client.py in read(self, amt)
    470             else:
    471                 try:
--> 472                     s = self._safe_read(self.length)
    473                 except IncompleteRead:
    474                     self._close_conn()

~/anaconda3/envs/python3/lib/python3.8/http/client.py in _safe_read(self, amt)
    611         IncompleteRead exception can be used to detect the problem.
    612         """
--> 613         data = self.fp.read(amt)
    614         if len(data) < amt:
    615             raise IncompleteRead(data, amt-len(data))

~/anaconda3/envs/python3/lib/python3.8/socket.py in readinto(self, b)
    667         while True:
    668             try:
--> 669                 return self._sock.recv_into(b)
    670             except timeout:
    671                 self._timeout_occurred = True

~/anaconda3/envs/python3/lib/python3.8/ssl.py in recv_into(self, buffer, nbytes, flags)
   1239                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1240                   self.__class__)
-> 1241             return self.read(nbytes, buffer)
   1242         else:
   1243             return super().recv_into(buffer, nbytes, flags)

~/anaconda3/envs/python3/lib/python3.8/ssl.py in read(self, len, buffer)
   1097         try:
   1098             if buffer is not None:
-> 1099                 return self._sslobj.read(len, buffer)
   1100             else:
   1101                 return self._sslobj.read(len)

OverflowError: signed integer is greater than maximum
Asked 1 year ago · 686 views

1 Answer

Using .read() pulls the full object into memory in a single call. With a >10 GB body, that single read exceeds what Python's ssl layer can transfer at once (the read length is capped at a signed 32-bit integer, about 2 GiB), which is why you see OverflowError: signed integer is greater than maximum rather than a memory error. Try avoiding that and use a method that reads the data incrementally. Or, even better, use a recent pandas to read directly from the S3 location - http://pandas.pydata.org/pandas-docs/stable/user_guide/io.html - v1.4 and v1.5 support S3 paths.

For example,

import pandas as pd

# AWS credentials - pandas reads s3:// paths via s3fs, so pass the
# credentials through storage_options instead of a boto3 client
aws_id = 'xxxx'
aws_secret = 'xxxx'

# To Pandas DataFrame
df_loan = pd.read_csv(
    's3://bucket-1/file1.txt',
    sep='\t',
    dtype='object',
    header=None,
    storage_options={'key': aws_id, 'secret': aws_secret},
)
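
Even when pandas reads from S3 directly, a >10 GB file parsed as object dtype may not fit comfortably in 32 GiB of RAM. Passing chunksize makes read_csv return an iterator of smaller DataFrames instead of one large frame; a minimal sketch (the chunk size is an arbitrary placeholder to tune to your memory budget):

# Stream the file in fixed-size row chunks instead of one giant DataFrame
chunks = pd.read_csv(
    's3://bucket-1/file1.txt',
    sep='\t',
    dtype='object',
    header=None,
    chunksize=1_000_000,  # placeholder; adjust to your memory budget
    storage_options={'key': aws_id, 'secret': aws_secret},
)

for chunk in chunks:
    # Do your processing part here
    pass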

If you are not using a recent version of pandas, use an alternative to .read() that streams the body instead, for example:

# Read data from S3
result = Client.get_object(Bucket="bucket-1", Key="file1.txt")
print(result)

# iter_lines() streams the body instead of pulling it into memory at once
for i, line in enumerate(result['Body'].iter_lines()):
    line_decoded = line.decode('utf-8')
    # Do your processing part here
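
Another option, if you have the disk space, is to download the object first - boto3's download_file uses managed multipart transfers, so no single read hits the ssl size limit - and then parse the local copy in chunks. A rough sketch reusing the Client from above (the local path and chunk size are placeholders):

# Stream the object to local disk in parts, then parse it in chunks
Client.download_file('bucket-1', 'file1.txt', '/tmp/file1.txt')

for chunk in pd.read_csv('/tmp/file1.txt', sep='\t', dtype='object',
                         header=None, chunksize=1_000_000):
    # Do your processing part here
    pass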
AWS
Answered 1 year ago
