Basic Operations
Variable Declaration
val x: Int = 10 // Immutable
var y: Int = 20 // Mutable

Collections
val list = List(1, 2, 3, 4, 5)
val array = Array(1, 2, 3, 4, 5)
val map = Map("a" -> 1, "b" -> 2, "c" -> 3)
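A few everyday collection operations, shown against the list and map above (variable names are just illustrative):
val doubled = list.map(_ * 2)        // List(2, 4, 6, 8, 10)
val evens = list.filter(_ % 2 == 0)  // List(2, 4)
val total = list.sum                 // 15
val aValue = map.getOrElse("a", 0)   // 1, with a default if the key is missing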
Spark Setup
Initialize Spark Session
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder
  .appName("Spark App")
  .config("spark.master", "local")
  .getOrCreate()
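When the application is done, the session can be shut down to release resources:
spark.stop()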
RDD Operations
Create RDD
val rdd = spark.sparkContext.parallelize(Seq(1, 2, 3, 4, 5))

Transformations
val mappedRDD = rdd.map(_ * 2)
val filteredRDD = rdd.filter(_ > 2)
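A few more transformations that come up often (illustrative sketch reusing rdd from above; pairRDD is introduced here only for the example):
val flatMappedRDD = rdd.flatMap(x => Seq(x, x * 10)) // one input element can produce many outputs
val pairRDD = rdd.map(x => (x % 2, x))               // key each element by parity
val summedByKey = pairRDD.reduceByKey(_ + _)         // sum the values for each key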
Actions
val collected = rdd.collect()
val count = rdd.count()
val firstElement = rdd.first()
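Other commonly used actions (sketch):
val firstThree = rdd.take(3)   // Array(1, 2, 3)
val summed = rdd.reduce(_ + _) // 15
rdd.foreach(println)           // runs on the executors, so output appears in executor logs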
DataFrame Operations
Create DataFrame
import spark.implicits._
val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "value")

Show DataFrame
df.show()

DataFrame Transformations
val filteredDF = df.filter($"id" > 1)
val selectedDF = df.select("value")
val withColumnDF = df.withColumn("new_column", $"id" * 2)
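A few other column-level operations worth keeping at hand (illustrative; output names are arbitrary):
val orderedDF = df.orderBy($"id".desc)                 // sort by a column, descending
val droppedDF = withColumnDF.drop("new_column")        // remove a column
val renamedDF = df.withColumnRenamed("value", "label") // rename a column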
SQL Queries
df.createOrReplaceTempView("table")
val sqlDF = spark.sql("SELECT * FROM table WHERE id > 1")
Dataset Operations
Create Dataset
case class Record(id: Int, value: String)
val ds = Seq(Record(1, "a"), Record(2, "b"), Record(3, "c")).toDS()

Dataset Transformations
val filteredDS = ds.filter(_.id > 1)
val mappedDS = ds.map(record => record.copy(value = record.value.toUpperCase))
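Typed aggregation is also available on Datasets via groupByKey (sketch, reusing ds from above):
val countsByValue = ds.groupByKey(_.value).count() // Dataset[(String, Long)]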
Conversions
RDD to DataFrame
val rddToDF = rdd.toDF("numbers")

DataFrame to RDD
val dfToRDD = df.rdd

DataFrame to Dataset
val dfToDS = df.as[Record]

Dataset to DataFrame
val dsToDF = ds.toDF()
Reading and Writing Data
Read CSV
val csvDF = spark.read.option("header", "true").csv("path/to/file.csv")

Write CSV
df.write.option("header", "true").csv("path/to/save")
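By default a write fails if the output path already exists; a save mode controls that behavior (overwrite, append, ignore, errorIfExists):
df.write.mode("overwrite").option("header", "true").csv("path/to/save")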
Read Parquet
val parquetDF = spark.read.parquet("path/to/file.parquet")

Write Parquet
df.write.parquet("path/to/save")
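Output can also be partitioned by a column, which lays files out in one directory per value (shown on the small example column just to illustrate the API):
df.write.partitionBy("id").parquet("path/to/save")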
Common Data Engineering Functions
GroupBy and Aggregations
import org.apache.spark.sql.functions._
val groupedDF = df.groupBy("id").count()
val aggregatedDF = df.groupBy("value").agg(sum("id")) // sum needs a numeric column, so aggregate "id" grouped by "value"
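Several aggregations can be computed in one pass with a single agg call (sketch, using the functions import above; alias names are arbitrary):
val statsDF = df.groupBy("value").agg(
  count("id").as("cnt"),
  avg("id").as("avg_id"),
  max("id").as("max_id")
)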
Join Operations
val df1 = Seq((1, "a"), (2, "b")).toDF("id", "value1")
val df2 = Seq((1, "x"), (2, "y")).toDF("id", "value2")
val joinedDF = df1.join(df2, "id")
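Other join types take the join columns as a Seq plus an explicit join type string (sketch):
val leftJoinDF = df1.join(df2, Seq("id"), "left")      // keep all rows from df1
val outerJoinDF = df1.join(df2, Seq("id"), "outer")    // keep rows from both sides
val antiJoinDF = df1.join(df2, Seq("id"), "left_anti") // rows in df1 with no match in df2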
Window Functions
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
val windowSpec = Window.partitionBy("id").orderBy("value")
val windowedDF = df.withColumn("rank", rank().over(windowSpec))
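Other window expressions reuse the same window spec (sketch; output column names are arbitrary):
val laggedDF = df.withColumn("prev_value", lag($"value", 1).over(windowSpec))
val numberedDF = df.withColumn("row_num", row_number().over(windowSpec))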
UDFs (User-Defined Functions)
import org.apache.spark.sql.functions.udf
val addOne = udf((x: Int) => x + 1)
val dfWithUDF = df.withColumn("new_value", addOne($"id"))
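A function can also be registered by name for use in SQL; this reuses the temp view created in the SQL Queries block (the name add_one is arbitrary):
spark.udf.register("add_one", (x: Int) => x + 1)
val sqlUDFDF = spark.sql("SELECT id, add_one(id) AS new_value FROM table")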