============================================================
INSTALL APACHE SPARK AND ZEPPELIN ON LAPTOP OR DESKTOP

Date: Mon Feb  2 11:45:45 AM EST 2026

OS: Linux
Platform: Rocky 9
Hadoop Version: 3.3.6
Spark Version:  3.5.6
============================================================

# NOTE: Previous versions of Spark and Hive have enjoyed a synergistic relationship.
# However, recent versions and distributions have limited the ability to 
# use each other's metastore (the database where metadata about tables is stored).
# There are some solutions (e.g., the Hive Warehouse Connector (HWC)); however,
# as installed, the two tools cannot share metadata and thus tables.

Step 1: Download Spark
=======================

# (old Spark 2.x download, kept for reference only — do not use with this guide):
# wget --no-check-certificate -P /tmp https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-without-hadoop.tgz
wget -P /tmp  https://archive.apache.org/dist/spark/spark-3.5.6/spark-3.5.6-bin-without-hadoop.tgz
# Next extract the package in /opt:

  mkdir -p /opt/
  tar xvzf /tmp/spark-3.5.6-bin-without-hadoop.tgz -C /opt


Step 2: Set Spark Path and JAVA_HOME 
====================================

# Set spark profile and path

echo 'export PATH=$PATH:/opt/spark-3.5.6-bin-without-hadoop/bin; export SPARK_HOME=/opt/spark-3.5.6-bin-without-hadoop' >/etc/profile.d/spark.sh

# copy config files and link to hive config

#  cp files/log4j.properties /opt/spark-3.5.6-bin-without-hadoop/conf/
  cp files/spark-defaults.conf /opt/spark-3.5.6-bin-without-hadoop/conf/
  cp files/spark-env.sh /opt/spark-3.5.6-bin-without-hadoop/conf/
  ln -s /opt/apache-hive-4.0.1-bin/conf/hive-site.xml /opt/spark-3.5.6-bin-without-hadoop/conf/hive-site.xml
  ln -s /opt/hadoop-3.3.6/etc/hadoop/core-site.xml /opt/spark-3.5.6-bin-without-hadoop/conf/core-site.xml
  ln -s /opt/hadoop-3.3.6/etc/hadoop/hdfs-site.xml /opt/spark-3.5.6-bin-without-hadoop/conf/hdfs-site.xml

# Create a Spark user and change ownership (do as root)

  useradd -g hadoop spark
  chown -R spark:hadoop /opt/spark-3.5.6-bin-without-hadoop

# Create HDFS user (as root become user hdfs) 

  su - hdfs 
  hdfs dfs -mkdir -p /user/spark
  hdfs dfs -chown -R spark:hadoop  /user/spark
  exit
# Create log directory

  mkdir /var/log/spark
  chown spark:hadoop /var/log/spark

# Create spark-events directory for the Spark UI

  mkdir /tmp/spark-events
  chown spark:hadoop /tmp/spark-events/
  chmod ug+rwx /tmp/spark-events/

Step 3: Test Spark Install
==========================

# If you have not logged out, run ". /etc/profile.d/spark.sh" before running the tests.
# This file will be applied automatically on all subsequent logins.
# Run the pi example

  run-example SparkPi 10

# Start the Spark shell (in Scala) Use ":q" to quit. Starts a local version with one thread.

  spark-shell 

# Start a Spark shell in Python (Python must be installed) Starts a local master with two threads
# using the "--master local[2]" option. Use ctrl-D or "quit()" to quit. 

  pyspark 

# also run the PySpark pi example

  spark-submit $SPARK_HOME/examples/src/main/python/pi.py 10

# Finally, start the R front-end (Still experimental and R must be installed) Use "q()" to quit.

 sparkR --master local

Step 4: Test Connecting to HDFS
===============================

# Make sure HDFS is started and use user hands-on

  su - hands-on

# Copy a hive log file to hdfs

  hdfs dfs -put /tmp/hadoop-service-startup.log

# Start PySpark

  pyspark

# to read a local file, "hadoop-service-startup.log",  use (your count will vary):

  >>> file=sc.textFile("file:///tmp/hadoop-service-startup.log")
  >>> file.count()
  19565

# to read the same file from HDFS, "hadoop-service-startup.log", use (your count will vary)

  >>> file=sc.textFile("/user/hands-on/hadoop-service-startup.log")
  >>> file.count()
  13643

Step 5: Install Spark History Server
====================================

# Copy start/stop scripts to /opt/services and change to owner spark

  cp scripts/*-spark-history.sh /opt/services/
  chmod 744 /opt/services/*-spark-history.sh
  chown spark:hadoop /opt/services/*-spark-history.sh

# Copy the systemd service script and enable

  cp files/spark-history.service /etc/systemd/system
  chmod 644 /etc/systemd/system/spark-history.service
  systemctl enable spark-history.service

# make sure LHM firewall allows port 18080 (README.first)
# make sure Virtualbox forwards port 18080 (README.first)

Step 6: Install Zeppelin Web Notebook
=====================================

# The following may need some additional configuration for your environment.

  wget -P /tmp https://dlcdn.apache.org/zeppelin/zeppelin-0.11.2/zeppelin-0.11.2-bin-all.tgz

  tar xvzf /tmp/zeppelin-0.11.2-bin-all.tgz -C /opt

# Once Zeppelin is extracted into /opt, copy the configuration file 
# The new configuration sets the Zeppelin port to 9995 (instead of 8080)

  cp files/zeppelin-site.xml /opt/zeppelin-0.11.2-bin-all/conf
  cp files/zeppelin-env.sh /opt/zeppelin-0.11.2-bin-all/conf

# Create a Zeppelin user and change ownership (do as root)

  useradd -g hadoop zeppelin 
  chown -R zeppelin:hadoop /opt/zeppelin-0.11.2-bin-all

# make the log directory
  mkdir /var/log/zeppelin
  chown zeppelin:hadoop /var/log/zeppelin/

# Create HDFS zeppelin user (as root become user hdfs) 

  su - hdfs 
  hdfs dfs -mkdir -p /user/zeppelin
  hdfs dfs -chown -R zeppelin:hadoop  /user/zeppelin
  exit

# Copy Zeppelin start and stop scripts to /opt/services

  cp scripts/*-zeppelin.sh /opt/services/
  chmod 744 /opt/services/*-zeppelin.sh
  chown zeppelin:hadoop /opt/services/*-zeppelin.sh

# Copy the systemd service script

  cp files/zeppelin.service /etc/systemd/system
  chmod 644 /etc/systemd/system/zeppelin.service
  systemctl enable zeppelin.service

# Create a staging directory

  mkdir -p /opt/staging
  chmod 777 /opt/staging
 
Step 7: Starting Zeppelin
=========================

# use Systemd to start, stop, restart Zeppelin
 
  systemctl start zeppelin

# the start and stop scripts used by Systemd are located in /opt/services;
# you should not have to use these scripts directly

  /opt/services/start-zeppelin.sh  
  /opt/services/stop-zeppelin.sh

# See the Zeppelin page for more info

   https://zeppelin.apache.org


Step 8: Connect to Zeppelin and Set python3 Path (NOT NEEDED)
=============================================================

# use a web browser to point to "localhost:9995" For example:

  firefox http://localhost:9995

# this should produce the Zeppelin WebUI, if it does not
# check the Zeppelin logs (make sure VM has port 9995 open)

# set Zeppelin to use Python 3 from python3 installed above

Python Interpreter Settings
---------------------------
# Using the Zeppelin WebUI, go to Anonymous menu in the upper right corner
# Select the Interpreter option to bring up the interpreter configuration page
# Scroll down to Python Interpreter section, select Edit
# Set the following (name/value) to the python3 path

  name                    value
  zeppelin.python         /usr/bin/python3

# go to bottom and add in the empty box, for the two names below, 
# add the name and value, click "+" when done

  name                    value
  PYSPARK_DRIVER_PYTHON   /usr/bin/python3
  PYSPARK_PYTHON          /usr/bin/python3

# click save and update/restart the interpreter

Hive Interpreter settings
-------------------------
# Create Hive interpreter
# Open zeppelin web UI, click Anonymous and select Interpreter
click create

enter "hive" for Interpreter Name
select "jdbc" for Interpreter Group

# Enter the following

  default.url: jdbc:hive2://localhost:10000 
  default.name: hive
  default.driver: 	org.apache.hive.jdbc.HiveDriver 

# Click "Save"

# Move to the jdbc interpreter directory and copy in all the needed jar files

# add jar files to  /opt/zeppelin-0.11.2-bin-all/interpreter/jdbc/

  cd /opt/zeppelin-0.11.2-bin-all/interpreter/jdbc/ 

  cp /opt/hadoop-3.3.6/share/hadoop/common/hadoop-common-3.3.6.jar .
  cp /opt/hadoop-3.3.6/share/hadoop/common/lib/httpcore-4.4.13.jar .
  cp /opt/hadoop-3.3.6/share/hadoop/common/lib/curator-client-5.2.0.jar .
  cp /opt/hadoop-3.3.6/share/hadoop/common/lib/stax2-api-4.2.1.jar .
  cp /opt/hadoop-3.3.6/share/hadoop/client/hadoop-client-runtime-3.3.6.jar .
  cp /opt/hadoop-3.3.6/share/hadoop/common/lib/woodstox-core-5.4.0.jar .
 
  cp /opt/apache-hive-4.0.1-bin/lib/hive-service-rpc-4.0.1.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/hive-serde-4.0.1.jar . 
  cp /opt/apache-hive-4.0.1-bin/lib/libthrift-0.16.0.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/hive-common-4.0.1.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/hive-jdbc-4.0.1.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/commons-collections-3.2.2.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/guava-27.0-jre.jar .
  cp /opt/apache-hive-4.0.1-bin/lib/hive-service-4.0.1.jar .

# at this point it is probably best to restart zeppelin

  systemctl stop zeppelin
  systemctl start zeppelin

Step 9: Test the Notebook
========================= 

# There is a notebook called "Spring-2026 Basic Tests (Python, PySpark,Hive, sh)" or similar
# You should be able to load and run each paragraph
