Catalogue
1. Connect Spark
2. Create DataFrame
  2.1. Create from variables (explicit schema)
  2.2. Create from variables (type inference)
  2.3. Read JSON
  2.4. Read CSV
  2.5. Read MySQL
  2.6. Create from pandas.DataFrame
  2.7. Read from column-stored Parquet
  2.8. Read from Hive
3. Save data
  3.1. Write to CSV
  3.2. Save to Parquet
  3.3. Write to Hive
  3.4. Write to HDFS
  3.5. Write to MySQL
1. Connect Spark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()
2. Create DataFrame
2.1. Create from variables (explicit schema)
# Generate comma-delimited data
stringCSVRDD = spark.sparkContext.parallelize([
    (123, 'Katie', 19, 'brown'),
    (234, 'Michael', 22, 'green'),
    (345, 'Simone', 23, 'blue')
])

# Specify the schema, StructField(name, dataType, nullable)
# where:
#   name: the field name
#   dataType: the field's data type
#   nullable: whether the field's value may be null
from pyspark.sql.types import StructType, StructField, LongType, StringType  # import the types

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

# Apply the schema to the RDD and create the DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)

# Create a temporary view from the DataFrame
# (in Spark 2.x, swimmers.createOrReplaceTempView("swimmers") is the preferred equivalent)
swimmers.registerTempTable("swimmers")

# Count the number of rows in the DataFrame
swimmers.count()
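Because the DataFrame has been registered as a temporary view, it can also be queried with plain SQL. A minimal sketch using the swimmers view created above:

spark.sql("select id, name, age from swimmers where age > 20").show()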
2.2. Create from variables (type inference)
# Use automatic type inference to create the DataFrame
data = [(123, 'Katie', 19, 'brown'),
        (234, 'Michael', 22, 'green'),
        (345, 'Simone', 23, 'blue')]
df = spark.createDataFrame(data, schema=['id', 'name', 'age', 'eyeColor'])
df.show()
df.count()
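To check what types the automatic inference produced, inspect the schema. A small sketch:

# Print the inferred schema as a tree
df.printSchema()
# Or get it as a list of (column, type) pairs
print(df.dtypes)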
2.3. Read JSON
# Read the sample data shipped with Spark
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\people.json"
df = spark.read.json(file)
df.show()
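spark.read.json can also parse an RDD of JSON strings, which is convenient for quick tests without a file. A minimal sketch with made-up records:

# Hypothetical in-memory JSON records, parsed the same way as a file
json_rdd = spark.sparkContext.parallelize([
    '{"name": "Katie", "age": 19}',
    '{"name": "Michael", "age": 22}'
])
df2 = spark.read.json(json_rdd)
df2.show()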
2.4. Read CSV
# First create a CSV file with pandas
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.rand(5, 5), columns=['a', 'b', 'c', 'd', 'e']).\
    applymap(lambda x: int(x * 10))
file = r"D:\hadoop_spark\spark-2.1.0-bin-hadoop2.7\examples\src\main\resources\random.csv"
df.to_csv(file, index=False)

# Then read the CSV file back with Spark
monthlySales = spark.read.csv(file, header=True, inferSchema=True)
monthlySales.show()
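inferSchema=True costs an extra pass over the file; if the column types are known in advance you can pass an explicit schema instead. A sketch assuming the five integer columns written above:

from pyspark.sql.types import StructType, StructField, IntegerType

csv_schema = StructType([StructField(c, IntegerType(), True)
                         for c in ['a', 'b', 'c', 'd', 'e']])
monthlySales = spark.read.csv(file, header=True, schema=csv_schema)
monthlySales.printSchema()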
2.5. Read MySQL
# The MySQL JDBC driver jar must first be copied into spark-2.2.0-bin-hadoop2.7\jars
# (this works in a standalone environment, but not in a cluster environment),
# then re-run the job.
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable='mysql.db',
    user='root',
    password='123456'
).load()
df.show()

# A SQL statement can also be passed in as the dbtable option
sql = "(select * from mysql.db where db = 'wp230') t"
df = spark.read.format('jdbc').options(
    url='jdbc:mysql://127.0.0.1',
    dbtable=sql,
    user='root',
    password='123456'
).load()
df.show()
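If you would rather not copy the driver jar into the jars directory, the connector can be handed to the session when it is created via the spark.jars option. A sketch; the jar path below is an assumption:

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .config('spark.jars', r'D:\jars\mysql-connector-java-5.1.40-bin.jar') \
    .getOrCreate()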
2.6. Create from pandas.DataFrame
# If no schema is specified, pandas's column names are used
df = pd.DataFrame(np.random.random((4, 4)))
spark_df = spark.createDataFrame(df, schema=['a', 'b', 'c', 'd'])
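The reverse conversion also works: a (small) Spark DataFrame can be collected back to the driver as a pandas DataFrame. A minimal sketch:

# toPandas() brings every row to the driver, so only use it on small results
pdf = spark_df.toPandas()
print(pdf.head())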
2.7. Read from column-stored Parquet
# Read the example Parquet file below
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\users.parquet"
df = spark.read.parquet(file)
df.show()
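Because Parquet is a columnar format, selecting only the columns you need lets Spark skip the rest of the data. A small sketch assuming the name and favorite_color columns of the bundled users.parquet example:

# Only the requested columns are read from the Parquet file
df.select("name", "favorite_color").show()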
2.8. Read from Hive
# If Spark has been configured with the Hive connection parameters, Hive data can be read directly
spark = SparkSession \
    .builder \
    .enableHiveSupport() \
    .master("spark://172.31.100.170:7077") \
    .appName("my_first_app_name") \
    .getOrCreate()

df = spark.sql("select * from hive_tb_name")
df.show()
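For reading a whole table, spark.table is an equivalent shortcut to the select statement above. A minimal sketch:

df = spark.table("hive_tb_name")
df.show()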
3. Save data
3.1. Write to CSV
# Create the DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write to CSV
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.csv"
spark_df.write.csv(path=file, header=True, sep=",", mode='overwrite')
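Note that Spark writes a directory of part files rather than a single CSV. If one file is required, coalesce the DataFrame to a single partition first; a sketch that is only reasonable for small data:

# Force a single partition so the output directory contains one part file
spark_df.coalesce(1).write.csv(path=file, header=True, sep=",", mode='overwrite')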
3.2. Save to Parquet
# Create the DataFrame
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.random((4, 4)), columns=['a', 'b', 'c', 'd'])
spark_df = spark.createDataFrame(df)

# Write to Parquet
file = r"D:\apps\spark-2.2.0-bin-hadoop2.7\examples\src\main\resources\test.parquet"
spark_df.write.parquet(path=file, mode='overwrite')
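Parquet output can also be partitioned by one or more columns, producing one subdirectory per value. A sketch using column 'a' of the example DataFrame purely for illustration:

# Writes .../test.parquet/a=<value>/part-*.parquet subdirectories
spark_df.write.partitionBy("a").parquet(path=file, mode='overwrite')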
3.3. Write to Hive
# Enable dynamic partitioning
spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
spark.sql("set hive.exec.dynamic.partition = true")

# Write to the partitioned table using ordinary Hive SQL
spark.sql("""
    insert overwrite table ai.da_aipurchase_dailysale_hive
    partition (saledate)
    select productid, propertyid, processcenterid, saleplatform, sku, poa, salecount, saledate
    from szy_aipurchase_tmp_szy_dailysale
    distribute by saledate
""")

# Or rebuild the partitioned table on every write
jdbcDF.write.mode("overwrite").partitionBy("saledate").insertInto("ai.da_aipurchase_dailysale_hive")
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_hive", None, "append", partitionBy='saledate')

# Without a partitioned table, simply import into the Hive table
jdbcDF.write.saveAsTable("ai.da_aipurchase_dailysale_for_ema_predict", None, "overwrite", None)
3.4. Write to HDFS
# Save the data to HDFS in CSV format
jdbcDF.write.mode("overwrite").options(header="true").csv("/home/ai/da/da_aipurchase_dailysale_for_ema_predict.csv")
3.5. Write to MySQL
# Fields are aligned automatically, i.e. the columns of spark_df do not have to match all columns of the MySQL table
# overwrite clears the table and then imports the data
spark_df.write.mode("overwrite").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000"  # example batch size
).save()

# append mode appends to the existing data
spark_df.write.mode("append").format("jdbc").options(
    url='jdbc:mysql://127.0.0.1',
    user='root',
    password='123456',
    dbtable="test.test",
    batchsize="1000"  # example batch size
).save()
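An equivalent form uses DataFrameWriter.jdbc directly, passing the connection details as a properties dict. A sketch; the driver class name shown is an assumption for MySQL Connector/J 5.x:

spark_df.write.jdbc(
    url='jdbc:mysql://127.0.0.1',
    table='test.test',
    mode='append',
    properties={'user': 'root', 'password': '123456',
                'driver': 'com.mysql.jdbc.Driver'}  # assumed driver class for Connector/J 5.x
)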