Migrating Cassandra cluster data with Spark

Update 2020-11-26

Using map instead of a for loop is more efficient: build one Cassandra scan per key, then union them all in a single call.

val params = Seq((20201101, 2, 31), (20201101, 2, 32))
// Broadcast the (id, target_domain, ...) tuples so ids.value is available below
val ids = sc.broadcast(params)

val distinctIds = ids.value.distinct
val unionedRdd = sc.union(distinctIds.map(
  id => sc.cassandraTable(keyspaceName, "crawl_hub_response")
    .select("id", "target_domain", "task_minute")
    .where("id = ?", id._1)
    .where("target_domain = ?", id._2)
    .where("task_minute >= ?", startTaskMinute)
    .where("task_minute <= ?", endTaskMinute)
))
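The unioned RDD can then be written to the destination cluster by switching the implicit CassandraConnector, the same trick as in solution 1 below. A minimal sketch, where host2 and destKeyspace are placeholders rather than names from the original job:

{
  // Requires com.datastax.spark.connector._ and com.datastax.spark.connector.cql.CassandraConnector
  // host2 is a placeholder for the destination cluster's contact point
  implicit val c = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "host2"))
  unionedRdd.saveToCassandra(destKeyspace, "crawl_hub_response")
}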

——————————————————————————————————————————————————

I recently ran into a requirement at work: migrate the data in one Cassandra cluster to another Cassandra cluster.

Searching online turned up two solutions, as follows.

1. Switch the CassandraConnector between read and write


import com.datastax.spark.connector._
import com.datastax.spark.connector.cql._
import org.apache.spark.rdd.RDD
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}
object TwoCassandraTransfer {
  val logger = Logger.getLogger(TwoCassandraTransfer.getClass().getName())
  def twoClusterExample(sc: SparkContext, dateStr: String, cassandraTableSource: String, cassandraTableDest: String) = {
    val connectorToClusterOne = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "host1"))
    val connectorToClusterTwo = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "host2"))
    val rddFromClusterOne = {
      // Sets connectorToClusterOne as default connection for everything in this code block
      implicit val c = connectorToClusterOne
      logger.info("spark.cassandra.connection.host1:" + cassandraTableSource)
      val sourceArr: Array[String] = cassandraTableSource.split("\\.")
      // union() returns a plain RDD[CassandraRow], so don't type this as CassandraTableScanRDD
      var resultRdd: RDD[CassandraRow] = null
      for (hour <- 0 until 24) {
        for (bucketId <- 0 until 10) {
          val hourStr = "%02d".format(hour)
          val tempRdd = sc.cassandraTable(sourceArr(0), sourceArr(1)).where(("event_day = '%s' and event_hour = '%s' and company_bucket = %d").format(dateStr, hourStr, bucketId))
          if (resultRdd == null) {
            resultRdd = tempRdd
          } else {
            resultRdd = resultRdd.union(tempRdd)
          }
        }
      }
      resultRdd
    }
    {
      //Sets connectorToClusterTwo as the default connection for everything in this code block
      implicit val c = connectorToClusterTwo
      logger.info("spark.cassandra.connection.host2:" + cassandraTableDest)
      val destArr: Array[String] = cassandraTableDest.split("\\.")
      rddFromClusterOne.saveToCassandra(destArr(0), destArr(1))
    }
  }
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("TransCassandraMove-test")
      .set("spark.cassandra.connection.host", "host")
      .set("spark.cassandra.connection.port", "port")
      .set("spark.cassandra.connection.timeout_ms", "10000")
      .set("spark.cassandra.read.timeout_ms", "10000")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.cassandra.input.consistency.level", "QUORUM")
      .set("spark.cassandra.output.consistency.level", "ANY")
    val sc = new SparkContext(sparkConf)
    // input: keyspace1.table,keyspace2.table,date
    val inputStr = args(0)
    val inputArr: Array[String] = inputStr.split(",")
    val cassandraTableSource = inputArr(0).trim
    val cassandraTableDest = inputArr(1).trim
    val dateStr = inputArr(2).trim
    twoClusterExample(sc, dateStr, cassandraTableSource, cassandraTableDest)
  }
}

However, this fails with java.lang.StackOverflowError: http://www.zhyea.com/2018/07/02/spark-java-lang-stackoverflowerror.html

The cause appears to be too many chained rdd.union calls: each union nests the previous lineage one level deeper, and serializing the resulting 240-level DAG overflows the stack.
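To illustrate the difference, a minimal self-contained sketch (the names here are mine, not from the original job): chaining union in a loop nests the lineage one level per call, while a single sc.union builds one flat UnionRDD.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

def deepVsFlat(sc: SparkContext, rdds: Seq[RDD[String]]): Unit = {
  // Chained: lineage depth grows with every union (240 levels in the job
  // above); serializing that DAG is what overflows the stack
  var chained = rdds.head
  for (r <- rdds.tail) chained = chained.union(r)

  // Flat: one UnionRDD over all inputs, lineage depth stays constant
  // no matter how many RDDs are combined
  val flat = sc.union(rdds)
}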

 

2. Use a DataFrame



import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object TwoCassandraTransfer {
  val logger = Logger.getLogger(TwoCassandraTransfer.getClass().getName())

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("TransCassandraMove-test")
    val sc = new SparkContext(sparkConf)
    val spark = SparkSession.builder().config(conf = sparkConf).enableHiveSupport().getOrCreate()

    // input: keyspace1.table,keyspace2.table,date
    val inputStr = args(0)
    val inputArr: Array[String] = inputStr.split(",")
    val cassandraTableSource = inputArr(0).trim
    val cassandraTableDest = inputArr(1).trim
    val dateStr = inputArr(2).trim
    val connectorToClusterOne = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "cq02-bce-bigdata-dw04.cq02.baidu.com"))
    val connectorToClusterTwo = CassandraConnector(sc.getConf.set("spark.cassandra.connection.host", "m1-bce-bigdata-dw02.m1.baidu.com"))

    val sourceArr: Array[String] = cassandraTableSource.split("\\.")
    val df_table = spark.read.format("org.apache.spark.sql.cassandra")
      .option("table", sourceArr(1))
      .option("keyspace", sourceArr(0)).load()

    var res_all = spark.emptyDataFrame
    for (hour <- 0 until 24) {
      for (bucketId <- 0 until 10) {
        val hourStr = "%02d".format(hour)
        val res = df_table.where("event_day = '%s' and event_hour = '%s' and company_bucket = %d".format(dateStr, hourStr, bucketId))
        // spark.emptyDataFrame is a lazy val, so this reference check only
        // matches on the first iteration, before res_all is reassigned
        if (res_all == spark.emptyDataFrame) {
          res_all = res
        } else {
          res_all = res_all.union(res) // unionAll is deprecated; union is the same operation
        }
      }
    }

    {
      //Sets connectorToClusterTwo as the default connection for everything in this code block
      implicit val c = connectorToClusterTwo
      val destArr: Array[String] = cassandraTableDest.split("\\.")
      res_all.rdd.saveToCassandra(destArr(0), destArr(1))
    }
  }

}

This still fails: Exception in thread "main" scala.ScalaReflectionException: <none> is not a term (https://github.com/scalapb/sparksql-scalapb/issues/11)

The table being migrated has null fields, which triggers this problem.
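If the DataFrame route were still wanted, one possible mitigation is to fill the nulls before writing. This is an untested sketch: the column names and defaults are placeholders, and I have not verified that it clears this particular exception.

// Hypothetical: replace nulls with defaults so the writer never sees them.
// "some_text_col" and "some_int_col" are placeholder column names.
val cleaned = res_all.na.fill(Map(
  "some_text_col" -> "",
  "some_int_col" -> 0
))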

 

In the end I went with the first solution, cutting the 240 unions down to 24: one query per hour, with the ten company_bucket values folded into a single IN clause (sketched below). But the job then reported java.io.IOException: Failed to write statements. The latest exception was Timed out waiting for server response.
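A sketch of what that read looks like, reusing the names from solution 1; the exact way I build the IN clause here is my own reconstruction, not the original code:

// One scan per hour; the 10 company_bucket values are folded into a single
// IN clause passed through to CQL, so 240 unions become 24
val buckets = (0 until 10).mkString(", ")
val hourlyRdds = (0 until 24).map { hour =>
  val hourStr = "%02d".format(hour)
  sc.cassandraTable(sourceArr(0), sourceArr(1))
    .where("event_day = '%s' and event_hour = '%s' and company_bucket in (%s)".format(dateStr, hourStr, buckets))
}
// Flat union keeps the lineage shallow (see the StackOverflowError note above)
val resultRdd = sc.union(hourlyRdds)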

Adjusting the following parameters fixed it:

.set("spark.cassandra.connection.timeout_ms", "30000")
.set("spark.cassandra.read.timeout_ms", "30000")
.set("auto.commit.interval.ms ", "5000")