Pyspark Code failing while connecting to Oracle database ----Invalid Oracle URL specified

0

Hello All

I have created 3 docker containers running in one network using docker images as follows : postgres aws glue image oracle image

Sharing docker yml for same .

version: "2"

services:
 spark-postgres:
  image: postgres:latest
  container_name: spark-postgres
  build: ./postgresql
  restart: always
  hostname: spark-postgres
  env_file:
   - ./env/postgresdb-env-vars.env
  ports:
   - "5432:5432"
  volumes:
  - ./data/territoryhub-replication/postgresql:/var/lib/postgresql/data
  networks:
   glue-network:
    ipv4_address: 10.4.0.4
 spark-oracle:
  image: oracle:test
  container_name: spark-oracle
  build: ./oracle
  restart: always
  hostname: spark-oracle
  env_file:
   - ./env/oracledb-env-vars.env
  ports:
   - "1521:1521"
  volumes:
  - ./data/territoryhub-replication/oracle:/opt/oracle/oradata
  - ./oracle/oracle-scripts:/opt/oracle/scripts/startup
  networks:
   glue-network:
    ipv4_address: 10.4.0.5
 spark-master:
  image: spark-master
  container_name: spark-master
  build: ./spark
  hostname: spark-master
  depends_on :
   - spark-postgres
   - spark-oracle
  ports:
   - "8888:8888"
   - "4040:4040"
  env_file: 
   - ./env/spark-env-vars.env
  command :  "/home/jupyter/jupyter_start.sh"
  volumes:
  - ../app/territoryhub-replication:/home/jupyter/jupyter_default_dir
  networks:
   glue-network:
    ipv4_address: 10.4.0.3
networks:
  glue-network:
    driver: bridge
    ipam:
      config:
       - subnet: 10.4.0.0/16
         gateway: 10.4.0.1 
         

 
  
  


Also I would like to mention I was not finding oracle jdbc driver to connect to oracle database so in my glue image (spark-master) I have added jar file . Sharing docker file of that

FROM amazon/aws-glue-libs:glue_libs_1.0.0_image_01
COPY jar/ojdbc8.jar   /home/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/jars/ojdbc8.jar




RUN mkdir -p /root/.aws
RUN echo "[default]\nregion=us-east-1" >> /root/.aws/config

Now I am simply trying to connect to oracle database created on same network from my local which I am running as follows : http://localhost:8888/tree?

pyspark code is

from pyspark import SparkContext
from awsglue.context import GlueContext

glueContext = GlueContext(SparkContext.getOrCreate()) 

inputDF = glueContext.create_dynamic_frame_from_options(connection_type = "oracle", connection_options = {"url": "jdbc:oracle:thin:@//10.4.0.5:1521:ORCLCDB", "user": 'system', "password": '<some pwd>' ,"dbtable": "<some table name>"})
inputDF.toDF().show()

I am getting below error

Invalid Oracle URL specified

An error was encountered:
An error occurred while calling o303.getDynamicFrame.
: java.sql.SQLException: Invalid Oracle URL specified
	at oracle.jdbc.driver.PhysicalConnection.parseUrl(PhysicalConnection.java:1738)
	at oracle.jdbc.driver.PhysicalConnection.readConnectionProperties(PhysicalConnection.java:1419)
	at oracle.jdbc.driver.PhysicalConnection.<init>(PhysicalConnection.java:943)
	at oracle.jdbc.driver.PhysicalConnection.<init>(PhysicalConnection.java:928)
	at oracle.jdbc.driver.T4CConnection.<init>(T4CConnection.java:557)
	at oracle.jdbc.driver.T4CDriverExtension.getConnection(T4CDriverExtension.java:68)
	at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:732)
	at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:648)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$8.apply(JDBCUtils.scala:900)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$8.apply(JDBCUtils.scala:896)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$connectWithSSLAttempt$1$$anonfun$apply$6.apply(JDBCUtils.scala:852)
	at scala.Option.getOrElse(Option.scala:121)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$connectWithSSLAttempt$1.apply(JDBCUtils.scala:852)
	at scala.Option.getOrElse(Option.scala:121)
	at com.amazonaws.services.glue.util.JDBCWrapper$.connectWithSSLAttempt(JDBCUtils.scala:852)
	at com.amazonaws.services.glue.util.JDBCWrapper$.connectionProperties(JDBCUtils.scala:895)
	at com.amazonaws.services.glue.util.JDBCWrapper.connectionProperties$lzycompute(JDBCUtils.scala:671)
	at com.amazonaws.services.glue.util.JDBCWrapper.connectionProperties(JDBCUtils.scala:671)
	at com.amazonaws.services.glue.util.JDBCWrapper.tableDF(JDBCUtils.scala:797)
	at com.amazonaws.services.glue.util.NoCondition$.tableDF(JDBCUtils.scala:85)
	at com.amazonaws.services.glue.util.NoJDBCPartitioner$.tableDF(JDBCUtils.scala:124)
	at com.amazonaws.services.glue.JDBCDataSource.getDynamicFrame(DataSource.scala:863)
	at com.amazonaws.services.glue.DataSource$class.getDynamicFrame(DataSource.scala:97)
	at com.amazonaws.services.glue.SparkSQLDataSource.getDynamicFrame(DataSource.scala:683)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

Traceback (most recent call last):
  File "/home/aws-glue-libs/awsglue.zip/awsglue/context.py", line 204, in create_dynamic_frame_from_options
    return source.getFrame(**kwargs)
  File "/home/aws-glue-libs/awsglue.zip/awsglue/data_source.py", line 36, in getFrame
    jframe = self._jsource.getDynamicFrame()
  File "/home/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/home/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/home/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o303.getDynamicFrame.
: java.sql.SQLException: Invalid Oracle URL specified
	at oracle.jdbc.driver.PhysicalConnection.parseUrl(PhysicalConnection.java:1738)
	at oracle.jdbc.driver.PhysicalConnection.readConnectionProperties(PhysicalConnection.java:1419)
	at oracle.jdbc.driver.PhysicalConnection.<init>(PhysicalConnection.java:943)
	at oracle.jdbc.driver.PhysicalConnection.<init>(PhysicalConnection.java:928)
	at oracle.jdbc.driver.T4CConnection.<init>(T4CConnection.java:557)
	at oracle.jdbc.driver.T4CDriverExtension.getConnection(T4CDriverExtension.java:68)
	at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:732)
	at oracle.jdbc.driver.OracleDriver.connect(OracleDriver.java:648)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$8.apply(JDBCUtils.scala:900)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$8.apply(JDBCUtils.scala:896)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$connectWithSSLAttempt$1$$anonfun$apply$6.apply(JDBCUtils.scala:852)
	at scala.Option.getOrElse(Option.scala:121)
	at com.amazonaws.services.glue.util.JDBCWrapper$$anonfun$connectWithSSLAttempt$1.apply(JDBCUtils.scala:852)
	at scala.Option.getOrElse(Option.scala:121)
	at com.amazonaws.services.glue.util.JDBCWrapper$.connectWithSSLAttempt(JDBCUtils.scala:852)
	at com.amazonaws.services.glue.util.JDBCWrapper$.connectionProperties(JDBCUtils.scala:895)
	at com.amazonaws.services.glue.util.JDBCWrapper.connectionProperties$lzycompute(JDBCUtils.scala:671)
	at com.amazonaws.services.glue.util.JDBCWrapper.connectionProperties(JDBCUtils.scala:671)
	at com.amazonaws.services.glue.util.JDBCWrapper.tableDF(JDBCUtils.scala:797)
	at com.amazonaws.services.glue.util.NoCondition$.tableDF(JDBCUtils.scala:85)
	at com.amazonaws.services.glue.util.NoJDBCPartitioner$.tableDF(JDBCUtils.scala:124)
	at com.amazonaws.services.glue.JDBCDataSource.getDynamicFrame(DataSource.scala:863)
	at com.amazonaws.services.glue.DataSource$class.getDynamicFrame(DataSource.scala:97)
	at com.amazonaws.services.glue.SparkSQLDataSource.getDynamicFrame(DataSource.scala:683)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

Purnima
asked 2 years ago1524 views
2 Answers
0

Hello,

I would like to inform you that there is a slight correction required for Oracle URL used in dynamic frame creation. Please use URL in below format:

"url": "jdbc:oracle:thin:@//<jdbc-host-name>:1521/ORCL"
AWS
answered 2 years ago
0

Hello Shubham , Thanks for taking out time . I tried with ORCL and ORCLPDB1 also but still getting issue and getting below mentioned error .

 java.io.InvalidObjectException: JDBC connection URL jdbc:oracle:thin:@10.4.0.5:1521/ORCL is not supported. Check the Developer Guide for the list of supported data stores / URL formatting.
: JDBC connection URL jdbc:oracle:thin:@10.4.0.5:1521/ORCLPDB1 is not supported. Check the Developer Guide for the list of supported data stores / URL formatting.

Also I would like to add that instead of using glueContext.create_dynamic_frame_from_options I used spark session to read DB and that is working fine

from pyspark import SparkContext
url='jdbc:oracle:thin:@10.4.0.5:1521/ORCLPDB1'
user='system'
password='root'
dbtable='xxgmdmadm.MDM_DNB_MONITORING_DTL'

df=spark.read.format('jdbc')\
.option('url',url)\
.option('dbtable',dbtable)\
.option('user',user)\
.option('password',password).load()
df.printSchema()

and converting into dynamic frame dy_output = DynamicFrame.fromDF(df, glueContext, "Data")

Purnima
answered 2 years ago

You are not logged in. Log in to post an answer.

A good answer clearly answers the question and provides constructive feedback and encourages professional growth in the question asker.

Guidelines for Answering Questions