Basic Operations | Variable Declaration | val x: Int = 10 // Immutable
var y: Int = 20 // Mutable |
| Collections | val list = List(1, 2, 3, 4, 5)
val array = Array(1, 2, 3, 4, 5)
val map = Map("a" -> 1, "b" -> 2, "c" -> 3) |
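A few everyday collection operations may be worth keeping next to the declarations; a minimal sketch using the list and map defined above.
val doubled = list.map(_ * 2)          // List(2, 4, 6, 8, 10)
val evens = list.filter(_ % 2 == 0)    // List(2, 4)
val total = list.foldLeft(0)(_ + _)    // 15
val aValue = map.getOrElse("a", 0)     // 1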
Spark Setup | Initialize Spark Session | import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.appName("Spark App").config("spark.master", "local").getOrCreate() |
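The builder accepts further settings before getOrCreate; a sketch in which local[*] and the shuffle-partition value are purely illustrative choices.
val spark = SparkSession.builder
  .appName("Spark App")
  .master("local[*]")                           // use all local cores
  .config("spark.sql.shuffle.partitions", "8")  // illustrative tuning value
  .getOrCreate()
import spark.implicits._                        // enables toDF/toDS and $"col"
// spark.stop()                                 // release resources when the job is done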
RDD Operations | Create RDD | val rdd = spark.sparkContext.parallelize(Seq(1, 2, 3, 4, 5)) |
| Transformations | val mappedRDD = rdd.map(_ * 2)
val filteredRDD = rdd.filter(_ > 2) |
| Actions | val collected = rdd.collect()
val count = rdd.count()
val firstElement = rdd.first() |
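A short chained example may help tie transformations and actions together; the values in the comments are simply what the rdd above would produce.
val pipeline = rdd.map(_ * 2).filter(_ > 4)   // lazy: nothing runs yet
val result = pipeline.collect()               // Array(6, 8, 10)
val total = pipeline.reduce(_ + _)            // 24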
DataFrame Operations | Create DataFrame | import spark.implicits._
val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "value") |
| Show DataFrame | df.show() |
| DataFrame Transformations | val filteredDF = df.filter($"id" > 1)
val selectedDF = df.select("value")
val withColumnDF = df.withColumn("new_column", $"id" * 2) |
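Because each transformation returns a new DataFrame, they chain naturally; a minimal sketch reusing df from above.
val chainedDF = df.filter($"id" > 1).withColumn("doubled", $"id" * 2).select("value", "doubled")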
| SQL Queries | df.createOrReplaceTempView("table")
val sqlDF = spark.sql("SELECT * FROM table WHERE id > 1") |
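Aggregates work the same way through SQL; a sketch against the temp view registered above.
val countsDF = spark.sql("SELECT value, COUNT(*) AS cnt FROM table GROUP BY value")
countsDF.show()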
Dataset Operations | Create Dataset | case class Record(id: Int, value: String)
val ds = Seq(Record(1, "a"), Record(2, "b"), Record(3, "c")).toDS() |
| Dataset Transformations | val filteredDS = ds.filter(_.id > 1)
val mappedDS = ds.map(record => record.copy(value = record.value.toUpperCase)) |
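Typed operations keep working against the Record case class; a small sketch, with the expected results shown for the ds above.
val upperValues = ds.filter(_.id > 1).map(_.value.toUpperCase).collect()   // Array("B", "C")
val countsByValue = ds.groupByKey(_.value).count()                         // Dataset[(String, Long)]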
Conversions | RDD to DataFrame | val rddToDF = rdd.toDF("numbers") |
| DataFrame to RDD | val dfToRDD = df.rdd |
| DataFrame to Dataset | val dfToDS = df.as[Record] |
| Dataset to DataFrame | val dsToDF = ds.toDF() |
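A quick round trip may make the relationships clearer; a sketch using df, ds, and Record as defined above.
val typedDS = df.as[Record]     // DataFrame -> Dataset[Record]
val backToDF = typedDS.toDF()   // Dataset[Record] -> DataFrame
val rowRDD = backToDF.rdd       // DataFrame -> RDD[Row]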
Reading and Writing Data | Read CSV | val csvDF = spark.read.option("header", "true").csv("path/to/file.csv") |
| Write CSV | df.write.option("header", "true").csv("path/to/save") |
| Read Parquet | val parquetDF = spark.read.parquet("path/to/file.parquet") |
| Write Parquet | df.write.parquet("path/to/save") |
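Reads and writes compose into a small pipeline; a sketch reusing the paths above, with inferSchema, na.drop, and overwrite mode added as illustrative choices.
val rawDF = spark.read.option("header", "true").option("inferSchema", "true").csv("path/to/file.csv")
val cleanedDF = rawDF.na.drop()                    // drop rows containing nulls
cleanedDF.write.mode("overwrite").parquet("path/to/save")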
Common Data Engineering Functions | GroupBy and Aggregations | import org.apache.spark.sql.functions._
val groupedDF = df.groupBy("id").count()
val aggregatedDF = df.groupBy("value").agg(sum("id")) // sum needs a numeric column |
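Several aggregates can go into one agg call; a sketch that reuses the functions import above, with the dept and salary columns assumed purely for illustration.
val salaries = Seq(("eng", 100), ("eng", 120), ("ops", 90)).toDF("dept", "salary")
val statsDF = salaries.groupBy("dept").agg(count("salary").as("n"), avg("salary").as("avg_salary"), max("salary").as("max_salary"))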
| Join Operations | val df1 = Seq((1, "a"), (2, "b")).toDF("id", "value1")
val df2 = Seq((1, "x"), (2, "y")).toDF("id", "value2")
val joinedDF = df1.join(df2, "id") |
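The three-argument join lets you choose the join type; a sketch reusing df1 and df2 from above ("left_outer" is one of the supported type strings).
val leftJoinedDF = df1.join(df2, Seq("id"), "left_outer")           // keep all rows from df1
val exprJoinedDF = df1.join(df2, df1("id") === df2("id"), "inner")  // join on an expression; keeps both id columns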
| Window Functions | import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
val windowSpec = Window.partitionBy("id").orderBy("value")
val windowedDF = df.withColumn("rank", rank().over(windowSpec)) |
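Since every id in df is unique, the rank above is always 1; a sketch with an assumed dept/name/salary DataFrame (reusing the imports in this row) shows the ranking more clearly.
val salariesDF = Seq(("eng", "alice", 120), ("eng", "bob", 100), ("ops", "carol", 90)).toDF("dept", "name", "salary")
val byDept = Window.partitionBy("dept").orderBy(desc("salary"))
val rankedDF = salariesDF.withColumn("rank", rank().over(byDept))   // eng: alice=1, bob=2; ops: carol=1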
| UDFs (User-Defined Functions) | import org.apache.spark.sql.functions.udf
val addOne = udf((x: Int) => x + 1)
val dfWithUDF = df.withColumn("new_value", addOne($"id")) |
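If the same logic is needed from SQL, the function can also be registered on the session; a sketch using the temp view created earlier (built-in functions are usually preferable when one exists).
spark.udf.register("addOne", (x: Int) => x + 1)
val viaSqlDF = spark.sql("SELECT id, addOne(id) AS new_value FROM table")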