Update 2020-11-26
Replacing the for loop with map is more efficient: all the per-key table scans are handed to sc.union in a single call, so the result is one flat UnionRDD instead of a deep chain of unions.
// Query parameters as (id, target_domain, ...) tuples; broadcasting them is
// assumed here, since the snippet reads them back through ids.value
val params = Seq((20201101, 2, 31), (20201101, 2, 32))
val ids = sc.broadcast(params)
val distinctIds = ids.value.distinct
sc.union(distinctIds.map(
  id => sc.cassandraTable(keyspaceName, "crawl_hub_response")
    .select("id", "target_domain", "task_minute")
    .where("id = ?", id._1)
    .where("target_domain = ?", id._2)
    .where("task_minute >= ?", startTaskMinute)
    .where("task_minute <= ?", endTaskMinute)
))
——————————————————————————————————————————————————
A recent task at work required migrating the data of one Cassandra cluster to another Cassandra cluster.
Searching online turned up two solutions, as follows:
1. Switch the CassandraConnector between reading and writing
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql._
import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object TwoCassandraTransfer {
  val logger = Logger.getLogger(TwoCassandraTransfer.getClass().getName())

  def twoClusterExample(sc: SparkContext, dateStr: String, cassandraTableSource: String, cassandraTableDest: String) = {
    val connectorToClusterOne = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "host1"))
    val connectorToClusterTwo = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "host2"))

    val rddFromClusterOne = {
      // Sets connectorToClusterOne as the default connection for everything in this code block
      implicit val c = connectorToClusterOne
      logger.info("spark.cassandra.connection.host1:" + cassandraTableSource)
      val sourceArr: Array[String] = cassandraTableSource.split("\\.")
      // union(..) returns a plain RDD, so the accumulator must be typed
      // RDD[CassandraRow], not CassandraTableScanRDD[CassandraRow]
      var resultRdd: RDD[CassandraRow] = null
      for (hour <- 0 until 24) {
        for (bucketId <- 0 until 10) {
          val hourStr: String = "%02d".format(hour)
          val tempRdd = sc.cassandraTable(sourceArr(0), sourceArr(1))
            .where("event_day = '%s' and event_hour = '%s' and company_bucket = %d".format(dateStr, hourStr, bucketId))
          if (resultRdd == null) {
            resultRdd = tempRdd
          } else {
            resultRdd = resultRdd.union(tempRdd)
          }
        }
      }
      resultRdd
    }

    {
      // Sets connectorToClusterTwo as the default connection for everything in this code block
      implicit val c = connectorToClusterTwo
      logger.info("spark.cassandra.connection.host2:" + cassandraTableDest)
      val destArr: Array[String] = cassandraTableDest.split("\\.")
      rddFromClusterOne.saveToCassandra(destArr(0), destArr(1))
    }
  }

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("TransCassandraMove-test")
      .set("spark.cassandra.connection.host", "host")
      .set("spark.cassandra.connection.port", "port")
      .set("spark.cassandra.connection.timeout_ms", "10000")
      .set("spark.cassandra.read.timeout_ms", "10000")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.cassandra.input.consistency.level", "QUORUM")
      .set("spark.cassandra.output.consistency.level", "ANY")
    val sc = new SparkContext(sparkConf)
    // args(0): keyspace1.table,keyspace2.table,date
    val inputStr = args(0)
    val inputArr: Array[String] = inputStr.split(",")
    val cassandraTableSource = inputArr(0).trim
    val cassandraTableDest = inputArr(1).trim
    val dateStr = inputArr(2).trim
    twoClusterExample(sc, dateStr, cassandraTableSource, cassandraTableDest)
  }
}
But this throws spark java.lang.StackOverflowError: http://www.zhyea.com/2018/07/02/spark-java-lang-stackoverflowerror.html
The likely cause is that rdd.union is called too many times: each call wraps another UnionRDD around the previous result, so 240 chained unions build a lineage deep enough to blow the stack when Spark walks it.
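A common fix (and what the 2020-11-26 update at the top of this post does) is to build all the per-slice RDDs up front and pass them to SparkContext.union in one call, which produces a single flat UnionRDD rather than a 240-deep nested chain. A minimal sketch, reusing sc, sourceArr, and dateStr from the listing above:

// Build one RDD per (hour, bucket) slice, then union them all at once.
// sc.union(Seq(...)) yields a single flat UnionRDD, so the lineage depth
// stays constant instead of growing by one level per union call.
val slices = for {
  hour <- 0 until 24
  bucketId <- 0 until 10
} yield {
  val hourStr = "%02d".format(hour)
  sc.cassandraTable(sourceArr(0), sourceArr(1))
    .where("event_day = '%s' and event_hour = '%s' and company_bucket = %d"
      .format(dateStr, hourStr, bucketId))
}
val resultRdd = sc.union(slices)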
2. Use DataFrames
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.log4j.Logger
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

object TwoCassandraTransfer {
  val logger = Logger.getLogger(TwoCassandraTransfer.getClass().getName())

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("TransCassandraMove-test")
    val sc = new SparkContext(sparkConf)
    val spark = SparkSession.builder().config(conf = sparkConf).enableHiveSupport().getOrCreate()
    // args(0): keyspace1.table,keyspace2.table,date
    val inputStr = args(0)
    val inputArr: Array[String] = inputStr.split(",")
    val cassandraTableSource = inputArr(0).trim
    val cassandraTableDest = inputArr(1).trim
    val dateStr = inputArr(2).trim
    val connectorToClusterOne = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "cq02-bce-bigdata-dw04.cq02.baidu.com"))
    val connectorToClusterTwo = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "m1-bce-bigdata-dw02.m1.baidu.com"))
    val sourceArr: Array[String] = cassandraTableSource.split("\\.")
    val df_table = spark.read.format("org.apache.spark.sql.cassandra")
      .option("table", sourceArr(1))
      .option("keyspace", sourceArr(0))
      .load()
    // Accumulate the per-(hour, bucket) slices; comparing against
    // spark.emptyDataFrame is only reference equality, so a null check is safer
    var res_all: DataFrame = null
    for (hour <- 0 until 24) {
      for (bucketId <- 0 until 10) {
        val hourStr: String = "%02d".format(hour)
        val res = df_table.where("event_day = '%s' and event_hour = '%s' and company_bucket = %d".format(dateStr, hourStr, bucketId))
        if (res_all == null) {
          res_all = res
        } else {
          res_all = res_all.unionAll(res) // deprecated in Spark 2.x; union does the same
        }
      }
    }

    {
      // Sets connectorToClusterTwo as the default connection for everything in this code block
      implicit val c = connectorToClusterTwo
      val destArr: Array[String] = cassandraTableDest.split("\\.")
      res_all.rdd.saveToCassandra(destArr(0), destArr(1))
    }
  }
}
This still fails: Exception in thread "main" scala.ScalaReflectionException: <none> is not a term https://github.com/scalapb/sparksql-scalapb/issues/11
The table being migrated contains null fields, which triggers this exception.
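I did not chase this further, but if one wanted to stay on the DataFrame route, a possible workaround (untested here; the fill values are arbitrary placeholders) would be to replace the nulls before writing, using DataFrame.na.fill:

// Untested sketch: blank out null string columns and zero out null numeric
// columns so no null field ever reaches the row writer
val res_filled = res_all.na.fill("").na.fill(0)
res_filled.rdd.saveToCassandra(destArr(0), destArr(1))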
In the end I chose the first approach, cutting the 240 unions down to 24: each query now covers all of company_bucket with an IN clause (see the sketch at the end of this post). But then it reports java.io.IOException: Failed to write statements. The latest exception was Timed out waiting for server response.
Raising the following parameters fixes it:
.set("spark.cassandra.connection.timeout_ms", "30000")
.set("spark.cassandra.read.timeout_ms", "30000")
.set("auto.commit.interval.ms ", "5000")