Scala Cheatsheet for Spark

Basic Operations

Variable Declaration
  val x: Int = 10 // Immutable
  var y: Int = 20 // Mutable

Collections
  val list = List(1, 2, 3, 4, 5)
  val array = Array(1, 2, 3, 4, 5)
  val map = Map("a" -> 1, "b" -> 2, "c" -> 3)
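
Scala collection methods use the same map/filter/reduce vocabulary that reappears on RDDs and Datasets, which makes them handy for prototyping transformation logic locally. A small sketch reusing the list above:

  val doubled = list.map(_ * 2)    // List(2, 4, 6, 8, 10)
  val bigOnes = list.filter(_ > 2) // List(3, 4, 5)
  val total   = list.reduce(_ + _) // 15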

Spark Setup

Initialize Spark Session
  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder
    .appName("Spark App")
    .config("spark.master", "local")
    .getOrCreate()
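
A minimal sketch of a complete application skeleton, assuming a local run; local[*] (all available cores) is shown here as an alternative to the single-threaded local master:

  import org.apache.spark.sql.SparkSession

  object SparkAppSketch {
    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder
        .appName("Spark App")
        .master("local[*]") // all cores; plain "local" runs single-threaded
        .getOrCreate()

      spark.sparkContext.setLogLevel("WARN") // reduce console noise

      // ... job logic here ...

      spark.stop() // release the session's resources when done
    }
  }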

RDD Operations

Create RDD
  val rdd = spark.sparkContext.parallelize(Seq(1, 2, 3, 4, 5))

Transformations
  val mappedRDD = rdd.map(_ * 2)
  val filteredRDD = rdd.filter(_ > 2)

Actions
  val collected = rdd.collect()
  val count = rdd.count()
  val firstElement = rdd.first()
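
Transformations are lazy: map and filter only record lineage, and nothing executes until an action runs. A short sketch chaining the two, assuming the spark session from the setup above:

  val rdd = spark.sparkContext.parallelize(Seq(1, 2, 3, 4, 5))

  // the plan is built lazily; collect() triggers the actual computation
  val result = rdd.map(_ * 2).filter(_ > 4).collect()
  // result: Array(6, 8, 10)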

DataFrame Operations

Create DataFrame
  import spark.implicits._
  val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "value")

Show DataFrame
  df.show()

DataFrame Transformations
  val filteredDF = df.filter($"id" > 1)
  val selectedDF = df.select("value")
  val withColumnDF = df.withColumn("new_column", $"id" * 2)

SQL Queries
  df.createOrReplaceTempView("my_table") // avoid "table" as a name; it is a SQL keyword
  val sqlDF = spark.sql("SELECT * FROM my_table WHERE id > 1")
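
The DataFrame API and SQL compile to the same logical plan, so either style can be used interchangeably. A compact sketch, assuming the spark session and df above (the view name is illustrative):

  df.createOrReplaceTempView("my_table")

  val viaApi = df.filter($"id" > 1).select("value")
  val viaSql = spark.sql("SELECT value FROM my_table WHERE id > 1")

  viaApi.show() // two rows: b, c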

Dataset Operations

Create Dataset
  // define the case class at top level so Spark can derive an encoder for it
  case class Record(id: Int, value: String)
  val ds = Seq(Record(1, "a"), Record(2, "b"), Record(3, "c")).toDS()

Dataset Transformations
  val filteredDS = ds.filter(_.id > 1)
  val mappedDS = ds.map(record => record.copy(value = record.value.toUpperCase))
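
Because a Dataset keeps its element type, transformations are checked at compile time: _.id is a plain field access, so a typo fails the build, whereas a misspelled column name on a DataFrame only fails at runtime. A small sketch reusing the Record class and ds above:

  // filter and map operate on Record values, not untyped Rows
  val upper = ds.filter(_.id > 1).map(r => r.copy(value = r.value.toUpperCase))
  // upper contains Record(2, "B") and Record(3, "C")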

Conversions

RDD to DataFrame
  val rddToDF = rdd.toDF("numbers")

DataFrame to RDD
  val dfToRDD = df.rdd

DataFrame to Dataset
  val dfToDS = df.as[Record]

Dataset to DataFrame
  val dsToDF = ds.toDF()
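
Note that df.rdd yields an RDD[Row], not an RDD of the original tuples, so fields must be extracted by position or name. A sketch of the round trips, using the df, ds, and Record definitions above:

  import org.apache.spark.sql.Row

  // DataFrame -> RDD[Row]: pull fields out explicitly
  val ids = df.rdd.map { case Row(id: Int, _*) => id }

  // DataFrame -> Dataset works only if column names and types match Record
  val typed = df.as[Record]

  // Dataset -> DataFrame drops the static type but keeps the schema
  val untyped = ds.toDF()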

Reading and Writing Data

Read CSV
  val csvDF = spark.read.option("header", "true").csv("path/to/file.csv")

Write CSV
  df.write.option("header", "true").csv("path/to/save")

Read Parquet
  val parquetDF = spark.read.parquet("path/to/file.parquet")

Write Parquet
  df.write.parquet("path/to/save")
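
By default a write fails if the target path already exists; a save mode controls that behavior, and inferSchema keeps CSV columns from all arriving as strings. A sketch of commonly used options (paths are placeholders):

  // infer column types instead of reading everything as strings
  val typedCsvDF = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("path/to/file.csv")

  // overwrite replaces existing output; other modes: append, ignore, errorIfExists
  typedCsvDF.write
    .mode("overwrite")
    .parquet("path/to/save")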

Common Data Engineering Functions

GroupBy and Aggregations
  import org.apache.spark.sql.functions._

  val groupedDF = df.groupBy("id").count()
  val aggregatedDF = df.groupBy("id").agg(sum("value")) // the aggregated column should be numeric
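
Several aggregates can go in one agg call and be renamed with alias. A sketch over hypothetical numeric data (the sales name and its columns are illustrative):

  import spark.implicits._
  import org.apache.spark.sql.functions.{avg, count, sum}

  val sales = Seq((1, 10.0), (1, 20.0), (2, 5.0)).toDF("id", "amount")

  val summary = sales.groupBy("id").agg(
    count("*").alias("n_rows"),
    sum("amount").alias("total"),
    avg("amount").alias("mean")
  )
  // id=1 -> n_rows=2, total=30.0, mean=15.0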

Join Operations
  val df1 = Seq((1, "a"), (2, "b")).toDF("id", "value1")
  val df2 = Seq((1, "x"), (2, "y")).toDF("id", "value2")
  val joinedDF = df1.join(df2, "id")
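
join(df2, "id") performs an inner join by default; passing a join type as a third argument selects other semantics. A quick sketch reusing df1 and df2:

  // keep every row of df1, with nulls where df2 has no match
  val leftJoined = df1.join(df2, Seq("id"), "left")

  // rows of df1 with no match in df2; other types include inner, right, full, left_semi
  val antiJoined = df1.join(df2, Seq("id"), "left_anti")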

Window Functions
  import org.apache.spark.sql.expressions.Window
  import org.apache.spark.sql.functions._

  val windowSpec = Window.partitionBy("id").orderBy("value")
  val windowedDF = df.withColumn("rank", rank().over(windowSpec))
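
With the sample df above every id is unique, so each rank comes out as 1; windows only become interesting when a partition holds several rows. A sketch with hypothetical grouped data:

  import spark.implicits._
  import org.apache.spark.sql.expressions.Window
  import org.apache.spark.sql.functions.{rank, row_number}

  val scores = Seq(("a", 10), ("a", 30), ("b", 20), ("b", 20)).toDF("group", "score")

  val w = Window.partitionBy("group").orderBy($"score".desc)

  // rank() leaves gaps after ties; row_number() always assigns distinct numbers
  val ranked = scores
    .withColumn("rank", rank().over(w))
    .withColumn("row_number", row_number().over(w))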

UDFs (User-Defined Functions)
  import org.apache.spark.sql.functions.udf

  val addOne = udf((x: Int) => x + 1)
  val dfWithUDF = df.withColumn("new_value", addOne($"id"))
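
UDFs are opaque to the Catalyst optimizer and add serialization overhead, so prefer built-in column expressions when one exists. The example above needs no UDF at all:

  // equivalent to addOne($"id"), but fully optimizable by Catalyst
  val dfBuiltin = df.withColumn("new_value", $"id" + 1)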