Hi,
I want to download the .mkv files in my S3 bucket.
I tried with a sample S3 bucket with subfolders. It is working very smoothly.
But on my production side, my .mkv files are a bit large, and the download via Python code is taking a very long time to complete.
Whereas, if I use a tool like Cyberduck, the entire 15 GB file is downloaded in 15 minutes. So it means there is nothing wrong with the settings of the AWS S3 bucket and its policies.
My code is as below.
import os
import shutil
import time
from os import path, makedirs
from cloudpathlib import CloudPath
from cloudpathlib import S3Client
# Ask the operator for the two local target directories: first the one that
# receives the raw downloads, then the one that receives the converted copies.
# The answers are stored exactly as typed.
downloadFolder, convertedFolder = (
    input(f'Enter the folder path for saving {label} Files:')
    for label in ('Downloaded', 'Converted')
)
# Read the access variables from "credentials.txt" in the current working
# directory. The file is expected to hold one "name=value" pair per line.
credential = os.path.join(os.getcwd(), 'credentials.txt')
myvars = {}
with open(credential, "r") as myfile:
    for line in myfile:
        line = line.strip()
        # Skip blank lines, comment lines and anything that is not a pair so
        # they do not end up as junk entries in the mapping.
        if not line or line.startswith('#') or '=' not in line:
            continue
        name, _, var = line.partition("=")
        # Strip both sides: "access_key = ABC" must yield "ABC", not " ABC",
        # otherwise the padded value is sent to AWS as the credential.
        myvars[name.strip()] = var.strip()

# Access variables. A missing entry raises KeyError naming the absent key.
access_key = myvars['access_key']
secret_access_key = myvars['secret_access_key']
bucket_name = myvars['bucket_name']
folder_path = myvars['folder_path']
# Build an S3 client from the explicit key pair and anchor a CloudPath at the
# root of the bucket; the listing and download steps below go through it.
# NOTE(review): the client is created with default transfer settings. If large
# objects download slowly, S3Client is documented to accept a
# `boto3_transfer_config` argument -- confirm against the installed
# cloudpathlib version before relying on it.
client = S3Client(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_access_key,
)
root_dir = CloudPath("s3://{}/".format(bucket_name), client=client)
# Count the entries directly under the configured prefix by walking the
# listing once. The total is the denominator used in the progress lines.
totalFileCount = sum(1 for _ in root_dir.glob(folder_path + '/*'))
print('Total no. of files', totalFileCount, sep='\n')
# Download every entry under the prefix into the download folder, then copy it
# into the converted folder under an ".mp4" name. A status line is printed
# after a file finishes, at most once every two seconds.
measure1 = time.time()
measure2 = time.time()
filesCompleted = 0
# Define the counters before the loop: the code that follows the loop reads
# them, and they would otherwise be unbound whenever no status line was printed
# (empty listing, or every file finishing within two seconds).
curDownloadCount = 0
curConvertedCount = 0
for f in root_dir.glob(folder_path + '/*'):
    filename = f.name
    print("file= " + filename)
    curFileName = 'store1_' + filename
    # Build each path once with os.path.join so a folder typed with a trailing
    # separator does not produce a doubled separator.
    src_path = os.path.join(downloadFolder, curFileName)
    f.download_to(src_path)

    # "Convert" .mkv to .mp4.
    # NOTE(review): this is a byte-for-byte copy under a new extension; no
    # re-encoding or remuxing happens here. It also writes every file to disk
    # a second time.
    newName, ext = os.path.splitext(curFileName)
    outFileName = newName + '.mp4'
    dst_path = os.path.join(convertedFolder, outFileName)
    shutil.copy(src_path, dst_path)
    filesCompleted += 1

    # Sample the clock after the work so the two-second check reflects the
    # time spent on this file rather than on the previous one.
    measure2 = time.time()
    if measure2 - measure1 >= 2:
        # Progress is taken from what is actually present in both folders.
        curDownloadCount = len(os.listdir(downloadFolder))
        curConvertedCount = len(os.listdir(convertedFolder))
        print("Status ==> Downloaded: " + str(curDownloadCount) + "/" + str(totalFileCount) + " Converted: " + str(
            curConvertedCount) + "/" + str(totalFileCount))
        measure1 = measure2
# Continue printing the status until both folders hold as many entries as the
# bucket listing reported.
# Take a fresh count before testing the condition: the counters are not
# guaranteed to have been assigned by the code above, and reading them
# unassigned raises NameError.
curDownloadCount = len(os.listdir(downloadFolder))
curConvertedCount = len(os.listdir(convertedFolder))
# NOTE(review): the download loop above runs synchronously, so nothing in this
# script adds files once it has finished. If the counts are still short here
# this loop will not end on its own -- confirm whether another process is
# expected to fill the folders.
while curDownloadCount < totalFileCount or curConvertedCount < totalFileCount:
    # Sleep between polls instead of spinning: this keeps the two-second
    # cadence without pinning a CPU core on repeated directory listings.
    time.sleep(2)
    curDownloadCount = len(os.listdir(downloadFolder))
    curConvertedCount = len(os.listdir(convertedFolder))
    print("Status ==> Downloaded: " + str(curDownloadCount) + "/" + str(totalFileCount) + " Converted: " + str(
        curConvertedCount) + "/" + str(totalFileCount))
Please let me know where I am wrong.
Thanks,
Sabarisri