Following this post: https://repost.aws/knowledge-center/glue-reduce-cloudwatch-logs
I have created the following Glue job:
from awsglue.context import GlueContext
from pyspark.context import SparkContext
import logging
from pyspark.sql.functions import rand, randn

DEFAULT_LOG_LEVEL = logging.DEBUG
DEFAULT_LOG_FORMAT = "%(asctime)s - %(levelname)-8s: %(message)s"


class SparkLogger(logging.Handler):
    """logging.Handler that forwards Python log records to the Glue/Spark (Java) logger."""

    def __init__(self, j_logger) -> None:
        self.j_logger = j_logger
        logging.Handler.__init__(self=self)

    def emit(self, record: logging.LogRecord) -> None:
        print(record)
        self.j_logger.log(None, None, record.levelno, logging.Handler.format(self, record), None, None)


def get_logger(
    name: str, level: int = DEFAULT_LOG_LEVEL, format: str = DEFAULT_LOG_FORMAT
) -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(level)
    glueContext = GlueContext(SparkContext.getOrCreate())
    handler = SparkLogger(glueContext.get_logger().log())
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter(fmt=format, datefmt="%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    return logger


def main():
    sc = SparkContext.getOrCreate()
    glueContext = GlueContext(sc)

    logger = get_logger("MEEEJN")
    logger.info("info message")
    logger.warning("warn message")
    logger.error("error message")

    # Create a DataFrame with two columns: 'id' (random integers) and 'value' (random floats)
    spark = glueContext.spark_session
    df = spark.range(0, 100).withColumn('id', (rand(seed=1234) * 100).cast('int')).withColumn('value', randn(seed=42))

    # Filter rows where 'value' > 0, then add a 'value_squared' column holding the square of 'value'
    transformed_df = df.filter(df['value'] > 0).withColumn('value_squared', df['value'] ** 2)

    # Collect and print the top 6 rows of the transformed DataFrame
    top_rows = transformed_df.take(6)
    print("Top 6 Rows:")
    for row in top_rows:
        print(row)


if __name__ == "__main__":
    main()
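(Side note: the level argument of get_logger doesn't have to be hardcoded; it could be driven by a job parameter instead. A minimal sketch, assuming a hypothetical --log_level job parameter set to e.g. "INFO" in the job details, and with get_logger from the script above in scope:)

import sys
import logging
from awsglue.utils import getResolvedOptions

# "log_level" refers to a hypothetical --log_level job parameter (names are passed without dashes)
args = getResolvedOptions(sys.argv, ["log_level"])
logger = get_logger("MEEEJN", level=getattr(logging, args["log_level"]))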
Then, in the job details under the Libraries section, I put the S3 URI of a log4j.properties file into the Referenced files path input. The file contains the following:
# Root Logger
log4j.rootLogger=DEBUG, stdout
log4j.logger.org.apache.spark.api.python.PythonGatewayServer=DEBUG
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%m%n
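(For reference, the Referenced files path field corresponds, as far as I understand, to the --extra-files default argument of the job, so the same setting can be applied with boto3. A minimal sketch; the job name and S3 URI are placeholders:)

import boto3

glue = boto3.client("glue")
job_name = "my-logging-test-job"  # placeholder

# UpdateJob resets anything left unspecified, so start from the current job definition
job = glue.get_job(JobName=job_name)["Job"]
args = dict(job.get("DefaultArguments", {}))
args["--extra-files"] = "s3://my-bucket/configs/log4j.properties"  # placeholder URI

# Copy over only fields that the JobUpdate structure accepts
job_update = {
    k: v
    for k, v in job.items()
    if k in ("Role", "Command", "GlueVersion", "WorkerType", "NumberOfWorkers")
}
job_update["DefaultArguments"] = args
glue.update_job(JobName=job_name, JobUpdate=job_update)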
This, in my understanding, should make my continuous logs contain only the log message, without the date and the other information normally prepended. Yet my logs in CloudWatch still look like this:
What am I doing wrong?
Is that Glue 3 or 4?
We are using Glue version 4.0.
Also, looking through CloudTrail, it does not seem that Glue even tried to pick up the file; no S3 operations are being made against it.
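(For completeness, this is the kind of sanity check I would run from the job driver itself to rule out a permissions or path problem; bucket and key are placeholders pointing at the same URI as the Referenced files path:)

import boto3

s3 = boto3.client("s3")
# Raises botocore.exceptions.ClientError if the object is missing or unreadable
s3.head_object(Bucket="my-bucket", Key="configs/log4j.properties")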