Initialize a SparkContext
import pyspark
from pyspark import SparkContext
sc = SparkContext()
# Create a collection of data called RDD, Resilient Distributed Dataset.
# Computation in an RDD is automatically parallelized across the cluster.
nums = sc.parallelize([1, 2, 3, 4])
# access the first row with take
nums.take(1)
# apply a transformation to the data with a lambda function
squared = nums.map(lambda x: x*x).collect()
for num in squared:
    print('%i ' % (num))
# The same values as a plain Python list
[num for num in squared]
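RDD transformations are lazy and only run when an action is called; here is a small sketch on the same nums RDD using filter and reduce:
# Keep only the even numbers (transformation, evaluated lazily)
evens = nums.filter(lambda x: x % 2 == 0)
# collect() is an action: it triggers the computation and returns [2, 4]
evens.collect()
# reduce() is another action: it aggregates all elements, here summing them to 10
nums.reduce(lambda a, b: a + b)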
from pyspark.sql import Row
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)]
rdd = sc.parallelize(list_p)
ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
DF_ppl = sqlContext.createDataFrame(ppl)
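DF_ppl is now a Spark DataFrame; as a quick sanity check you can display its rows with show():
# Display the content of the DataFrame
DF_ppl.show()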
If you want to check the type of each feature, use printSchema().
DF_ppl.printSchema()
from pyspark.sql import SQLContext
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlContext = SQLContext(sc)
You can read the CSV file with sqlContext.read.csv.
Set inferSchema to True to tell Spark to guess the type of each column automatically; if you leave it at False (the default), every column is read as a string. Here the file is read as strings, and the types are converted manually below.
df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, inferSchema=False)
df.printSchema()
df.show(5, truncate = False)
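If you prefer not to rely on inference at all, read.csv also accepts an explicit schema. The sketch below only declares the first few columns of the adult dataset for illustration; a real schema must list every column in the file:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Illustrative, partial schema: one StructField per column in the CSV
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", IntegerType(), True),
    # ... add a StructField for each remaining column ...
])
# df = sqlContext.read.csv(SparkFiles.get("adult_data.csv"), header=True, schema=schema)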
# Import all from `sql.types`
from pyspark.sql.types import *
# Write a custom function to convert the data type of DataFrame columns
def convertColumn(df, names, newType):
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
# List of continuous features
CONTI_FEATURES = ['age', 'fnlwgt','capital-gain', 'educational-num', 'capital-loss', 'hours-per-week']
# Convert the type
df = convertColumn(df, CONTI_FEATURES, FloatType())
# Check the dataset
df.printSchema()
df.select('age','fnlwgt').show(5)
df.groupBy("education").count().sort("count",ascending=True).show()
# To get summary statistics of the data, use describe():
### - count
### - mean
### - standard deviation
### - min
### - max
df.describe('age','fnlwgt').show()
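Called without arguments, describe() summarizes every column; for string columns only count, min, and max are meaningful:
# Summary statistics for all columns
df.describe().show()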
df.select('age','income').show(5)
You can also compute descriptive statistics between two pairwise columns. For instance, you can count the number of people with income below or above 50k by age. This operation is called a crosstab.
df.crosstab('age', 'income').sort("age_income").show()
df.drop('educational-num').columns
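Note that drop() does not modify df in place; it returns a new DataFrame without the column, so assign the result if you want to keep it:
# drop() returns a new DataFrame; df itself still has the column
df_reduced = df.drop('educational-num')
df_reduced.columns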
df.filter(df.age > 40).count()
df.groupby('marital-status').agg({'capital-gain': 'mean', 'capital-loss': 'sum'}).show()
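The same aggregation can also be written with the column functions from pyspark.sql.functions, which lets you alias the output columns (a sketch of an equivalent query):
from pyspark.sql.functions import mean, sum as sum_
# Equivalent aggregation with explicit output column names
df.groupby('marital-status').agg(
    mean('capital-gain').alias('avg_capital_gain'),
    sum_('capital-loss').alias('total_capital_loss')
).show()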
from pyspark.sql.functions import *
# 1 Select the column
age_square = df.select(col("age")**2)
# 2 Apply the transformation and add it to the DataFrame
df = df.withColumn("age_square", col("age")**2)
df.printSchema()
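To check the new column, select the original and squared ages side by side (the first five rows are enough):
# Compare age with its square
df.select('age', 'age_square').show(5)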