I have a very basic AWS Glue ETL job that I created to perform a SQL select transformation and export the results in CSV format. This works, however; the output generates around 20 unique files and since the dataset only has two rows in it right now this is problematic. My requirement is to have a single CSV file that contains all of the data selected from the dataset. I have tried both repartition and coalesce function unsuccessfully. I am able to generate the single file, but my data is missing. Most likely because it is in a different partition. I am new to AWS Glue and have been unable to figure this out so any suggestions will be much appreciated.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
for alias, frame in mapping.items():
frame.toDF().createOrReplaceTempView(alias)
result = spark.sql(query)
return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node PostgreSQL
PostgreSQL_node1644981751584 = glueContext.create_dynamic_frame.from_catalog(
database="newApp",
table_name="database_schema_staging_hdr",
transformation_ctx="PostgreSQL_node1644981751584",
)
# Script generated for node SQL
SqlQuery0 = """
select * from myDataSource
"""
SQL_node1644981807578 = sparkSqlQuery(
glueContext,
query=SqlQuery0,
mapping={"myDataSource": PostgreSQL_node1644981751584},
transformation_ctx="SQL_node1644981807578",
)
# Script generated for node Amazon S3
AmazonS3_node1644981816657 = glueContext.write_dynamic_frame.from_options(
frame=SQL_node1644981807578,
connection_type="s3",
format="csv",
connection_options={"path": "s3://awsglueetloutput/", "partitionKeys": []},
transformation_ctx="AmazonS3_node1644981816657",
)
job.commit()