Spark SQL 写入 Json 格式文件报错 org.apache.spark.sql.AnalysisException: Found duplicate column(s)
json
适用于现代 C++ 的 JSON。
项目地址:https://gitcode.com/gh_mirrors/js/json
免费下载资源
·
错误场景
如下两个 Json 文件
person.json
{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}
dept.json
{"name":"草帽","deptno":1}
{"name":"白胡子","deptno":2}
{"name":"红发","deptno":3}
当通过如下语句,join 两个表在写入到 新的 Json 文件的时候报错
val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
val sqlContext = new SQLContext(sc)
// 1)相关处理
val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
person.join(dept, person("deptno") === dept("deptno"), "outer").repartition(1)
.write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
错误如下:
org.apache.spark.sql.AnalysisException: Found duplicate column(s) when inserting into file:/C:/Users/LUFFY/Desktop/testData/joinJson: `deptno`, `name`;
at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:68)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:676)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
at sparkSql.demo1.testError(demo1.scala:142)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
at org.junit.runners.ParentRunner.run(ParentRunner.java:363)
at org.junit.runner.JUnitCore.run(JUnitCore.java:137)
at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)
问题原因
列名重复
解决方案
修改列名,如下:
@Test
def testJoinSave() ={
val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
val sqlContext = new SQLContext(sc)
// 1)相关处理
val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
//另存为 Json
//分别拿出两张表的列名
val c_person = person.columns
val c_dept = dept.columns
//分别对两张表的别名进行设置
val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)
person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
.write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
}
GitHub 加速计划 / js / json
41.72 K
6.61 K
下载
适用于现代 C++ 的 JSON。
最近提交(Master分支:1 个月前 )
960b763e
3 个月前
8c391e04
6 个月前
更多推荐
已为社区贡献2条内容
所有评论(0)