Hey all,
I am trying to run the script below, which I write to a file named "vw_aws_a_bijlageprofile.py" using `%%writefile`. This code has worked for me with other data sources, but now I am getting the following message in the CloudWatch Logs:
"***2022-08-24T20:09:19.708-05:00
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv***"
Any idea how I get around this error?
Full code below.
Thank you in advance!!!!
%%writefile vw_aws_a_bijlageprofile.py
import os
import sys
import subprocess
def install(package):
    """Install *package* with pip, using the interpreter running this script.

    The quiet flag is passed to pip (after ``install``) rather than to the
    Python interpreter, so it actually reduces pip's output.  The child
    environment also sets ``PIP_ROOT_USER_ACTION=ignore`` so that pip
    releases supporting that option stop printing the "Running pip as the
    'root' user" warning; that warning is informational and does not by
    itself fail the install.

    Args:
        package: Requirement specifier handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # NOTE(review): older pip releases are expected to ignore the unknown
    # PIP_ROOT_USER_ACTION setting rather than fail -- confirm on the
    # container's pip version.
    pip_env = dict(os.environ, PIP_ROOT_USER_ACTION="ignore")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", package],
        env=pip_env,
    )
# Runtime dependencies, installed one at a time in this exact order before
# the third-party imports that follow are attempted.
for requirement in (
    'awswrangler',
    'tqdm',
    'pandas',
    'botocore',
    'ruamel.yaml',
    'pandas-profiling',
):
    install(requirement)
import awswrangler as wr
import pandas as pd
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
from string import Template
import gc
import boto3
from pandas_profiling import ProfileReport
# Region-pinned boto3 session; run_profile() passes it to the Athena query.
session = boto3.Session(region_name="eu-west-2")

# S3 client created independently of `session`; nothing in the visible
# script references it afterwards.
client = boto3.client('s3')
def run_profile(table="vw_aws_a_bijlage",
                database="intl-euro-archmcc-database",
                workgroup='DataScientists'):
    """Read an Athena table in full and build a minimal profiling report.

    Runs ``SELECT *`` against ``"<database>"."<table>"`` through
    ``wr.athena.read_sql_query`` using the module-level ``session``, then
    wraps the result in a ``ProfileReport`` created with ``minimal=True``.
    No cleaning or re-indexing is applied to the data before profiling.

    Args:
        table: Name of the table or view to profile.  Pass a different name
            here instead of editing the query text.
        database: Athena database that holds ``table``; also passed as the
            ``database`` argument of the query call.
        workgroup: Athena workgroup the query is submitted to.

    Returns:
        The ``ProfileReport`` built from the query result.
    """
    # The identifiers are interpolated into the SQL text unescaped, so only
    # pass trusted, hard-coded names.
    query = f"""
SELECT * FROM "{database}"."{table}"
;
"""
    tableforprofile = wr.athena.read_sql_query(
        query,
        database=database,
        boto3_session=session,
        ctas_approach=False,
        workgroup=workgroup,
    )
    print("read in the table queried above")

    profile_tblforprofile = ProfileReport(
        tableforprofile,
        title="Pandas Profiling Report",
        minimal=True,
    )
    print("Generated table profile")
    return profile_tblforprofile
if __name__ == '__main__':
    # Build the report first; nothing is written until it exists.
    report = run_profile()
    print("Generated outputs")

    # Destination of the rendered HTML report.
    #switch profile name below
    html_target = '/opt/ml/processing/output/profile_vw_aws_a_bijlage.html'
    print(html_target)
    report.to_file(html_target)
import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput
# boto3 session pinned to the region used for the SageMaker session below.
session = boto3.Session(region_name="eu-west-2")
bucket = 'intl-euro-uk-datascientist-prod'
prefix = 'Mark'

sm_session = sagemaker.Session(default_bucket=bucket, boto_session=session)

# Upload the processing script under "<prefix>/source" in the bucket.
source_key_prefix = f'{prefix}/source'
sm_session.upload_data(
    path='vw_aws_a_bijlageprofile.py',
    bucket=bucket,
    key_prefix=source_key_prefix,
)
import boto3
#import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
region = boto3.session.Session().region_name
# Single root for every S3 location used by this job, so changing `bucket`
# or `prefix` moves the script URI and the output destination together.
S3_ROOT_PATH = "s3://{}/{}".format(bucket, prefix)
role = get_execution_role()

# NOTE(review): the job log quoted with this script reports that this
# container image pins jinja2/MarkupSafe/numpy/pandas versions which the
# script's own pip installs replace. A different framework_version may
# avoid those conflicts -- TODO confirm which versions are available.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    sagemaker_session=sm_session,
    instance_type='ml.m5.24xlarge',
    instance_count=1,
)

# Run the uploaded script with no inputs; whatever it writes to
# /opt/ml/processing/output is sent to the IODataProfiles/ destination.
sklearn_processor.run(
    code='{}/source/vw_aws_a_bijlageprofile.py'.format(S3_ROOT_PATH),
    inputs=[],
    outputs=[
        ProcessingOutput(
            output_name='output',
            source='/opt/ml/processing/output',
            destination='{}/IODataProfiles/'.format(S3_ROOT_PATH),
        ),
    ],
)
Huh, that is really interesting. Maybe that isn't the problem, then. I just re-ran the above and got the same error message; maybe I didn't include enough details, though. The log also contains the following: sagemaker-sklearn-container 1.0 requires jinja2==2.10.2, but you have jinja2 3.1.2 which is incompatible. sagemaker-sklearn-container 1.0 requires MarkupSafe==1.1.1, but you have markupsafe 2.1.1 which is incompatible. sagemaker-sklearn-container 1.0 requires numpy==1.19.5, but you have numpy 1.21.6 which is incompatible. sagemaker-sklearn-container 1.0 requires pandas==0.25.*, but you have pandas 1.3.5 which is incompatible.
I can send you the CloudWatch processing-job error log as a CSV or image file, but even when I take a screenshot of the error log, which is only a 74 KB PNG file, I cannot attach it here.
Hey @Ivan - can I send over the error log please?