HPE Vertica Connector for Apache Spark: Error reading DataFrame

Scenario:

Read data into a Spark DataFrame from a Vertica VMart table using "com.vertica.spark.datasource.DefaultSource"

Env:

HPE Vertica Community Edition VM

select version();
Vertica Analytic Database v7.2.0-0

Spark version: 1.5.0

Downloaded the "HPE Vertica Connector For Apache Spark" from
https://saas.hpe.com/marketplace/haven/hpe-vertica-connector-apache-spark

 

Repro:

spark-shell --jars /Users/jackg/projects/BigDataInfra/hpe_vertica_spark_connector/vertica-jdbc-7.1.2-0.jar,/Users/jackg/projects/BigDataInfra/hpe_vertica_spark_connector/vertica-spark-connector.jar --master local[1]

 

scala> val opt: Map[String, String] = Map("table" -> "customer_dimension", "db" -> "VMart", "user" -> "dbadmin", "password" -> "redacted", "host" -> "192.168.1.90", "dbschema" -> "public", "port" -> "5433", "numPartitions"-> "4", "tmpdir" -> "/tmp")

opt: Map[String,String] = Map(host -> 192.168.1.90, tmpdir -> /tmp, db -> VMart, dbschema -> public, port -> 5433, user -> dbadmin, numPartitions -> 4, table -> customer_dimension, password -> redacted)

 

scala> val df = sqlContext.read.format("com.vertica.spark.datasource.DefaultSource").options(opt).load()

df: org.apache.spark.sql.DataFrame = [customer_key: bigint, customer_type: string, customer_name: string, customer_gender: string, title: string, household_id: bigint, customer_address: string, customer_city: string, customer_state: string, customer_region: string, marital_status: string, customer_age: bigint, number_of_children: bigint, annual_income: bigint, occupation: string, largest_bill_amount: bigint, store_membership_card: bigint, customer_since: date, deal_stage: string, deal_size: bigint, last_deal_update: date]

 

// we have a DF

scala> df.rdd.partitions.size

res0: Int = 4

 

scala> df.printSchema

root

 |-- customer_key: long (nullable = false)

 |-- customer_type: string (nullable = true)

...

 

Test:

// Prove that we can work with RDDs in Spark 1.5.0
scala> val df1 = sc.makeRDD(1 to 12, 4).toDF("IntCol") // RDD with 4 partitions to a DataFrame
scala> df1.take(4).foreach(println)
[1]
[2]
[3]
[4]

 

// get a row from the Vertica connector DF

scala> df.take(1).foreach(println)   

15/11/21 15:10:06 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.SpecificMutableRow.setString(ILjava/lang/String;)V
	at com.vertica.spark.datasource.VerticaDataSourceRDD$$anon$1.getNext(VerticaRDD.scala:368)
	at com.vertica.spark.datasource.VerticaDataSourceRDD$$anon$1.hasNext(VerticaRDD.scala:421)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
	at scala.collection.AbstractIterator.to(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:215)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1839)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1839)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:88)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

15/11/21 15:10:06 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.SpecificMutableRow.setString(ILjava/lang/String;)V
	(same stack trace as above)

15/11/21 15:10:06 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.SpecificMutableRow.setString(ILjava/lang/String;)V
	(same stack trace as above)

 

15/11/21 15:10:06 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
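
The NoSuchMethodError suggests a binary incompatibility: the connector jar appears to have been built against the Spark 1.4.x catalyst internals, and the setString(Int, String) method it calls on SpecificMutableRow is not present in the 1.5.0 runtime. A quick way to check from the same spark-shell (diagnostic sketch only, nothing more than JVM reflection on the class named in the error):

scala> classOf[org.apache.spark.sql.catalyst.expressions.SpecificMutableRow].getMethods.filter(_.getName == "setString").map(_.toString).foreach(println)

If no method with an (int, java.lang.String) signature is printed, the connector and the running Spark version are simply not binary compatible, which would be consistent with the 1.4.1 vs 1.5.0 behavior noted in the comments below.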

 

Comments

  • I tested this on Spark 1.4.1 and it works.

    scala> df.take(2).foreach(println)

    [5,Individual,Craig S. Robinson,Male,Sir,10858,138 Alden Ave,Fayetteville,NC,East,Unknown,19,0,79772,Detective,153,0,1986-12-20,null,null,null]

    [18,Individual,Mark M. Kramer,Male,Dr.,28093,311 Green St,Joliet,IL,MidWest,Separated,58,1,363387,Chef,844,0,2006-05-18,null,null,null]
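
  • A possible interim workaround until the connector supports Spark 1.5.x: read the same table through Spark's built-in JDBC data source, bypassing the connector entirely. Untested sketch, assuming the Vertica JDBC jar is on the classpath and reusing the host, port and credentials from the repro above:

    scala> val jdbcOpts = Map("url" -> "jdbc:vertica://192.168.1.90:5433/VMart", "dbtable" -> "public.customer_dimension", "driver" -> "com.vertica.jdbc.Driver", "user" -> "dbadmin", "password" -> "redacted")

    scala> val jdbcDF = sqlContext.read.format("jdbc").options(jdbcOpts).load()

    scala> jdbcDF.take(1).foreach(println)

    This loses the connector-specific options (numPartitions, tmpdir), but it should tell us whether the failure is specific to the connector jar rather than to the JDBC driver or the database.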

     
