============================================================
INSTALL SINGLE INSTANCE OF APACHE HADOOP/HIVE FROM APACHE 

Date: Mon Feb  2 11:39:03 AM EST 2026

OS: Linux
Platform: Rocky Linux 9.6
Hadoop Version: 3.3.6
Java Version: java-11-openjdk.aarch64
Java Version: java-11-openjdk.x86_64
============================================================

# Note: To view web based UI's the examples below assume the Firefox browser. Almost
# any browser can be used in place of Firefox.

Step 1: Download Hadoop
=======================

# Unless otherwise noted the following steps are done by root
# Download the distribution (either x86 or ARM) from the Hadoop web site 
# (http://hadoop.apache.org/).   For example load the file to /tmp

# For x86 platforms
  wget -P /tmp https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz  
# For ARM (aarch) platforms
  wget -P /tmp  https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/hadoop-3.3.6-aarch64.tar.gz

# Next extract the package into /opt:

  mkdir -p /opt/

# For x86 platforms
  tar xvzf /tmp/hadoop-3.3.6.tar.gz -C /opt
# For ARM (aarch) platforms
  tar xvzf /tmp/hadoop-3.3.6-aarch64.tar.gz -C /opt


Step 2: Set JAVA_HOME and HADOOP_HOME
=====================================

# In the preparation step in the README.first file, java-11-openjdk.aarch64 was installed and set to the default
# java version. 

# Next we add java.sh to profile.d so that version is available

  echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk'> /etc/profile.d/java.sh
  echo 'export LOG4J_FORMAT_MSG_NO_LOOKUPS=true'>> /etc/profile.d/java.sh

# Add hadoop.sh to profile.d

  echo 'export HADOOP_HOME=/opt/hadoop-3.3.6;export PATH=$HADOOP_HOME/bin:$PATH' >/etc/profile.d/hadoop.sh

# To make sure JAVA_HOME and HADOOP_HOME are defined for this session, source the new script:

   source /etc/profile.d/java.sh
   source /etc/profile.d/hadoop.sh

Step 3A: Script Install
=======================

# The script "hadoop-setup-script.sh" (in the scripts directory) will perform 
# Steps 3 through 11.  Run the script as root and skip to Step 11. Steps 3 through 11
# perform the same tasks as the script. 
  
  cd scripts
  sh hadoop-setup-script.sh

Step 3: Create Users and Groups
===============================

# Consult the script and the config files in /root/Hadoop-Minimal-Install-Notes-V2/Hadoop-Hive/files/
# Includes the "hands-on" user

# (Skip to Step 11 if you ran the script in Step 3A)

  groupadd hadoop
  useradd -g hadoop yarn
  useradd -g hadoop hdfs
  useradd -g hadoop mapred
  useradd -g hadoop hands-on


Step 4: Make Data and Log Directories
=====================================

  mkdir -p /var/data/hadoop/hdfs/nn
  mkdir -p /var/data/hadoop/hdfs/snn
  mkdir -p /var/data/hadoop/hdfs/dn
  chown hdfs:hadoop /var/data/hadoop/hdfs -R

#  Create the log directory and set the owner and group as follows:

  mkdir /var/log/hadoop
  chown yarn:hadoop /var/log/hadoop/
  chmod ug+rwx /var/log/hadoop

Step 5: Configure core-site.xml  
===============================
# Add the following properties to /opt/hadoop-3.3.6/etc/hadoop/core-site.xml

<configuration>
   <property>
      <name>fs.defaultFS</name>
      <value>hdfs://localhost:9000</value>
   </property>
   <property>
      <name>hadoop.http.staticuser.user</name>
      <value>hdfs</value>
   </property>
   <property>
      <name>hadoop.proxyuser.hive.groups</name>
      <value>*</value>
    </property>
    <property>
      <name>hadoop.proxyuser.hive.hosts</name>
      <value>*</value>
    </property>
</configuration>

Step 6: Configure hdfs-site.xml
===============================
# Add the following properties to /opt/hadoop-3.3.6/etc/hadoop/hdfs-site.xml

<configuration>
 <property>
   <name>dfs.replication</name>
   <value>1</value>
 </property>
 <property>
   <name>dfs.namenode.name.dir</name>
   <value>file:/var/data/hadoop/hdfs/nn</value>
 </property>
 <property>
   <name>fs.checkpoint.dir</name>
   <value>file:/var/data/hadoop/hdfs/snn</value>
 </property>
 <property>
   <name>dfs.datanode.data.dir</name>
   <value>file:/var/data/hadoop/hdfs/dn</value>
 </property>
 <property>
   <name>dfs.permissions.superusergroup</name>
   <value>hadoop</value>
 </property>
<property>
   <name>dfs.encrypt.data.transfer</name>
   <value>false</value>
 </property>

</configuration>


Step 7: Configure mapred-site.xml
=================================
# Note: Hadoop 3.x ships mapred-site.xml directly in etc/hadoop (there is no
# .template file as in Hadoop 2.x). If your distribution does include a
# template, copy it first:
#   cp mapred-site.xml.template mapred-site.xml

# Add the following properties to /opt/hadoop-3.3.6/etc/hadoop/mapred-site.xml

<configuration>
<property>
   <name>mapreduce.framework.name</name>
   <value>yarn</value>
</property>
<property>
   <name>mapreduce.jobhistory.intermediate-done-dir</name>
   <value>/mr-history/tmp</value>
</property>
<property>
   <name>mapreduce.jobhistory.done-dir</name>
   <value>/mr-history/done</value>
</property>
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
</configuration>

Step 8: Configure yarn-site.xml
===============================
# Add the following properties to /opt/hadoop-3.3.6/etc/hadoop/yarn-site.xml

<configuration>
  <property>
      <name>yarn.nodemanager.aux-services</name>
      <value>mapreduce_shuffle</value>
  </property>
  <property>
      <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
      <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
   <property>
      <name>yarn.log-aggregation-enable</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.nodemanager.remote-app-log-dir</name>
      <value>/app-logs</value>
  </property>
  <property>
      <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
      <value>logs</value>
   </property>
   <property>
      <name>yarn.webapp.ui2.enable</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.timeline-service.http-cross-origin.enabled</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.resourcemanager.webapp.cross-origin.enabled</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.nodemanager.webapp.cross-origin.enabled</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.timeline-service.enabled</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
      <value>true</value>
  </property>
  <property>
      <name>yarn.timeline-service.generic-application-history.enabled</name>
      <value>true</value>
  </property>
</configuration>

Step 8A: Prevent Native Library Warning
=======================================
# Append the following to log4j.properties using the commands below:
  echo '# Remove Native code warning' >> /opt/hadoop-3.3.6/etc/hadoop/log4j.properties 
  echo 'log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR' >> /opt/hadoop-3.3.6/etc/hadoop/log4j.properties

Step 9: Modify Java Heap Sizes
==============================
# Heap sizes are now auto-scaled based on machine memory; the values below
# simply provide conservative defaults just in case.

# Edit /opt/hadoop-3.3.6/etc/hadoop/hadoop-env.sh file to reflect the following 

  # The maximum amount of heap to use, in MB. Default is 1000.
  HADOOP_HEAPSIZE=500
  HADOOP_NAMENODE_INIT_HEAPSIZE="500"

# Add the following to the end of /opt/hadoop-3.3.6/etc/hadoop/hadoop-env.sh:

  export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
  export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native"

#  Next, in the same directory, edit the mapred-env.sh to reflect the following:

  HADOOP_JOB_HISTORYSERVER_HEAPSIZE=250

# Finally, in the same directory, edit yarn-env.sh to reflect the following:

  YARN_HEAPSIZE=500
  JAVA_HEAP_MAX=-Xmx500m 


Step 10: Configure Systemd
==========================

# Systemd will automate the start-up of all the Hadoop services. The following
# steps will install a systemd service for hadoop i.e. you can use
#   systemctl start hadoop
#   systemctl stop hadoop
#   systemctl restart hadoop
# and other systemd commands to start and stop the Hadoop services.

  mkdir -p /opt/services
  cp scripts/start-hdfs.sh /opt/services	
  cp scripts/stop-hdfs.sh /opt/services	
  cp scripts/start-yarn.sh /opt/services	
  cp scripts/stop-yarn.sh /opt/services	
  cp scripts/start-mapred.sh /opt/services	
  cp scripts/stop-mapred.sh /opt/services	
  cp scripts/startup-hadoop-services /opt/services	
  cp scripts/shutdown-hadoop-services /opt/services	
  chmod 744 /opt/services/*

  cp files/hadoop.service /etc/systemd/system
  chmod 644 /etc/systemd/system/hadoop.service

# systemd command to enable hadoop service startup
  systemctl enable hadoop.service

Step 11: Format HDFS
====================
# As user "hdfs" 

 su - hdfs

 cd /opt/hadoop-3.3.6/bin
 ./hdfs namenode -format

# If the command worked, you should see the following near the end of a long list of messages:

  INFO common.Storage: Storage directory /var/data/hadoop/hdfs/nn has been successfully formatted.

# return to root user

  exit

Step 12: Start the HDFS Services
================================
# If you used the install script or performed Step 10: Configure Systemd, you can start all
# services (both in this step (12) and the next step (13)) with the following
# command

  systemctl start hadoop

# If all daemons start (see the "jps" command below) then there is no need to 
# perform the following steps. Consult the /tmp/hadoop-service-startup.log 
# for startup progress. 

# The systemd hadoop service will start all the Hadoop services in this step (12) and 
# the next step (13) when the VM boots. There is no need to perform these steps
# as a user. 

Start and Check Separate Daemons
--------------------------------
# If you want to check and start each daemon separately, follow the commands below
# NOTE: These steps are not needed if you used the "systemctl" command above.

# As user hdfs

  su - hdfs
  cd /opt/hadoop-3.3.6/bin
  ./hdfs --daemon start namenode
  ./hdfs --daemon start secondarynamenode
  ./hdfs --daemon start  datanode

# If the daemon started, you should see responses above that will point to the log file
# located at /opt/hadoop-3.3.6/logs

# (Note that the actual log file is appended with ".log" not ".out.")
# Issue a jps command to see that all the services are running. The actual PID 
# values will be different than shown in this listing:

  $ jps
  15140 SecondaryNameNode
  15015 NameNode
  15335 Jps
  15214 DataNode

# All HDFS services can be stopped using the hdfs command.  
# For example, to stop the datanode service enter the following

   ./hdfs --daemon stop datanode

# The same can be done for the Namenode and SecondaryNameNode
# If you did not use systemctl to start all the daemons, (see above) then 
# you will need to stop them "by hand" as shown above. 

Complete HDFS installation
--------------------------
# Once HDFS is working, some directories need to be created.
# Create /mr-history for job history server directory in hdfs
# (Also a good test to make sure HDFS is working)
# Two user directories are also created; 
#  hdfs - the HDFS root user
#  hands-on - a user directory for running examples
# change to user "hdfs" (if not already) and create the following in HDFS

  su - hdfs

  hdfs dfs -mkdir -p /mr-history/tmp
  hdfs dfs -mkdir -p /mr-history/done
  hdfs dfs -chown -R yarn:hadoop  /mr-history
  hdfs dfs -mkdir /tmp
  hdfs dfs -chmod go+rwx /tmp
  hdfs dfs -mkdir -p /user/hdfs
  hdfs dfs -mkdir -p /user/hands-on
  hdfs dfs -chown -R hands-on:hadoop  /user/hands-on
  exit 

# you should be back at the root prompt "#"

HDFS Convenience Scripts
------------------------
# There are two "convenience" scripts to start and stop HDFS services (run as root)
# These are run as part of the "systemctl start hadoop" command.
# The scripts are located at /opt/services/
  
  start-hdfs.sh  
  stop-hdfs.sh

# these scripts perform starting and stopping the NameNode, SecondaryNameNode, and the DataNode
# as a group. These scripts should not be needed in the normal course of events. 
# See "Systemd Control" below.

Step 13: Start YARN Services
============================
# If you used the install script or performed Step 10: Configure Systemd, you can start all
# services (both in this step (13) and the previous step (12)) with the following
# command (this will start all Hadoop services, only run once, and on the
# next reboot, it will start automatically)

  systemctl start hadoop

# If all daemons start (see the "jps" command below) then there is no need to
# perform the following steps. Consult the /tmp/hadoop-service-startup.log
# for startup progress.

# The systemd hadoop service will start all the Hadoop services in this step (13) and
# the previous step (12) when the VM boots. There is no need to perform these steps
# as a user.

# If you want to check and start each daemon separately, follow the commands below
# These steps are not needed if you used the "systemctl" command above.
# as user "yarn"

  su - yarn
  cd /opt/hadoop-3.3.6/bin
  ./yarn --daemon start resourcemanager
  ./yarn --daemon start nodemanager
  ./yarn --daemon start timelineserver

  jps
  23090 Jps
  20546 ApplicationHistoryServer
  19987 ResourceManager
  20080 NodeManager
  21944 JobHistoryServer

# Similar to HDFS, the Yarn services can be stopped by issuing a stop argument to the yarn command:
  
  ./yarn --daemon stop nodemanager

# The same can be done for the ResourceManager and the ApplicationHistoryServer
# If you did not use systemctl to start all the daemons (see above), then
# you will need to stop them "by hand" as shown above.

# There are two convenience scripts to start and stop YARN services (run as root)
# The scripts are located at /opt/services/
  
  start-yarn.sh            
  stop-yarn.sh

# these scripts perform starting and stopping the NodeManager, ResourceManager, and  
# ApplicationHistoryServer as a group. These scripts should not be needed in the 
# normal course of events. See "Systemd Control" below.


Step 14: Verify the Running Services Using the Web Interface
============================================================
# 


# To see the HDFS web interface (other browsers can be used):

 firefox  http://localhost:9870

# To see the ResourceManager (YARN) web interface:

 firefox http://localhost:8088

# Run a Sample MapReduce Example as user "hdfs"

  su - hdfs
  export YARN_EXAMPLES=/opt/hadoop-3.3.6/share/hadoop/mapreduce/

# To test your installation, run the sample "pi" application
 
 yarn jar $YARN_EXAMPLES/hadoop-mapreduce-examples-3.3.6.jar pi 8 100000

If these tests worked, the Hadoop installation should be working correctly. 


Step 15: Systemd Control of Hadoop Components
=============================================

# There are two scripts in /opt/services that are used to startup and shutdown
# all the Hadoop daemons mentioned above (HDFS and Yarn). These are:

  shutdown-hadoop-services  
  startup-hadoop-services 

# these are primarily used by systemd to start and stop all the Hadoop daemons.
# They can be run from the command line if needed, however, it is best to 
# use systemd to manage all the daemons (although the way these scripts 
# are structured, starting daemons is done as a group so individual issues
# may not show up or be recorded by systemd). The /tmp/hadoop-service-startup.log
# may give some indication if there were problems with individual daemon startup,
# however, if a daemon is not starting, check the daemon log file in the
# /var/log/hadoop directory.

Step 16: Give hands-on User sudo permissions and password
=========================================================

# set the hands-on password

echo "hands-on:minimal" | chpasswd

# As root, edit the /etc/sudoers file:

  visudo

# Add the following line

  hands-on        ALL=(ALL)       ALL

# after these lines

  ## Allow root to run any commands anywhere
  root    ALL=(ALL)       ALL

# The new lines should look like:

  ## Allow root to run any commands anywhere
  root    ALL=(ALL)       ALL
  hands-on        ALL=(ALL)       ALL

# Save and exit visudo ("wq") 
# the next time hands-on logs in they will have full sudo permissions.
# They can escalate to root using:

  sudo su -

-------------------
Install Apache Hive 
-------------------
OS: Linux
Platform: Rocky 9
Hive Version: 4.0.1
Derby Version: 10.14.2.0

Step 1: Install and Configure Hive
==================================

# As root, get sources, extract, create /etc/profile.d/hive.sh

  wget -P /tmp https://archive.apache.org/dist/hive/hive-4.0.1/apache-hive-4.0.1-bin.tar.gz

  tar xvzf /tmp/apache-hive-4.0.1-bin.tar.gz -C /opt

  echo 'export PATH=$PATH:/opt/apache-hive-4.0.1-bin/bin; export HIVE_HOME=/opt/apache-hive-4.0.1-bin' >/etc/profile.d/hive.sh

# make needed directories in HDFS

  su - hdfs -c "hdfs dfs -mkdir -p /user/hive/warehouse"
  su - hdfs -c "hdfs dfs -chmod 755 /user/hive/warehouse"
  su - hdfs -c "hdfs dfs -chown -R hive:hadoop  /user/hive"

# Copy hive configuration files from files directory to hive install path
  
  /bin/cp files/hive-site.xml /opt/apache-hive-4.0.1-bin/conf
  /bin/cp files/hive-env.sh /opt/apache-hive-4.0.1-bin/conf
  /bin/cp files/hive-log4j2.properties /opt/apache-hive-4.0.1-bin/conf
  /bin/cp files/parquet-logging.properties /opt/apache-hive-4.0.1-bin/conf
# set all files to 644
  chmod 644 /opt/apache-hive-4.0.1-bin/conf/*
# Create a Hive user and change ownership (do as root)

  useradd -g hadoop hive
  chown -R hive:hadoop /opt/apache-hive-4.0.1-bin

# set up log directory

  mkdir /var/log/hive
  chown hive:hadoop /var/log/hive
  chmod ug+rwx /var/log/hive
  chmod +t /var/log/hive

# remove the extra log4j-slf4j, guava library (included in Hadoop install)

  mv /opt/apache-hive-4.0.1-bin/lib/log4j-slf4j-impl-2.18.0.jar /opt/apache-hive-4.0.1-bin/lib/log4j-slf4j-impl-2.18.0.jar.extra
  mv /opt/apache-hive-4.0.1-bin/lib/guava-22.0.jar /opt/apache-hive-4.0.1-bin/lib/guava-22.0.jar.extra
 cp /opt/hadoop-3.3.6/share/hadoop/common/lib/guava-27.0-jre.jar /opt/apache-hive-4.0.1-bin/lib/

  cp scripts/start-hive-metastore.sh /opt/services 
  cp scripts/stop-hive-metastore.sh /opt/services

  cp scripts/start-hiveserver2.sh /opt/services 
  cp scripts/stop-hiveserver2.sh  /opt/services
  chmod 744 /opt/services/*
  chown hive:hadoop  /opt/services/start-hive*  /opt/services/stop-hive*

# set up systemd to start hive-meta-server and hiveserver2

  cp files/hive-metastore.service /etc/systemd/system
  cp files/hiveserver2.service /etc/systemd/system
  
  chmod 644 /etc/systemd/system/hive-metastore.service 
  chmod 644 /etc/systemd/system/hiveserver2.service
 
  systemctl enable hive-metastore.service
  systemctl enable hiveserver2.service

Step 2: Use MariaDB (MySQL) for the Hive Metastore
==================================================
# Hive requires a "metastore" database for metadata about the database. 
# The default is Apache Derby, however, since MariaDB is already installed 
# and more robust than Derby, we can use it for metastore database.


Set up MariaDB
--------------
# For simplicity, the root login for MariaDB has no password, as root perform the following
# The process creates the metastore_db in MariaDB with the user "hiveusr" and password "hive"

  mysql -u root 
  MariaDB [(none)]> CREATE DATABASE metastore_db;
  USE metastore_db;
  SOURCE /opt/apache-hive-4.0.1-bin/scripts/metastore/upgrade/mysql/hive-schema-4.0.0.mysql.sql;
  CREATE USER 'hiveusr' IDENTIFIED BY 'hive';
  GRANT all on *.* to 'hiveusr'@localhost identified by 'hive';
  flush privileges;
  exit

Configure Hive
---------------
# The default hive-site.xml is configured for MariaDB and there should be no other configuration needed. 

Alternate: Install Apache Derby
===============================

# Hive uses Derby as the default metastore. Aside from adding another package to the LHM, Derby seems
# to have some issues operating as a Hive metastore. The use of Derby is not recommended

  wget -P /tmp  https://archive.apache.org/dist/db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-bin.tar.gz
  tar xvzf /tmp/db-derby-10.14.2.0-bin.tar.gz -C /opt

# set the Derby environment defines. Note, derby database will be in $DERBY_HOME/data, change as needed.

  echo 'export DERBY_HOME=/opt/db-derby-10.14.2.0-bin; export PATH=$DERBY_HOME/bin:$PATH; export DERBY_OPTS="-Dderby.system.home=$DERBY_HOME/data"'>/etc/profile.d/derby.sh

# create the derby data and log directory

  mkdir /opt/db-derby-10.14.2.0-bin/data
  mkdir /opt/db-derby-10.14.2.0-bin/logs

# make log directory

  mkdir /var/log/derby
  chown hive:hadoop /var/log/derby

# source these file to make sure $DERBY_HOME and $HIVE_HOME are defined

  source /etc/profile.d/derby.sh
  source /etc/profile.d/hive.sh

# Change derby to hive user

  chown -R hive:hadoop /opt/db-derby-10.14.2.0-bin/

# copy these libraries to $HIVE_HOME

  cp $DERBY_HOME/lib/derbyclient.jar $HIVE_HOME/lib
  cp $DERBY_HOME/lib/derbytools.jar $HIVE_HOME/lib

# Test starting (and stopping) Derby (nohup will leave log file in the directory you run command)
#  Run as root.

  nohup startNetworkServer -h 0.0.0.0 &

# To stop use (as root)

   stopNetworkServer

# set up systemd 

  cp files/derby.service /etc/systemd/system
  chmod 644 /etc/systemd/system/derby.service
  systemctl enable derby.service

# copy derby start and stop file, change owner to hive

  cp scripts/start-derby.sh /opt/services
  cp scripts/stop-derby.sh /opt/services
  chmod 744 /opt/services/*-derby.sh  
  chown hive:hadoop /opt/services/*-derby.sh  

# Start the Derby server and configure the Hive schema (schematool is a Hive command)

  systemctl start derby 

  schematool -initSchema -dbType derby

# get info and check database

  schematool -info -dbType derby
  schematool -validate -dbType derby

# look at the database

  ij
  ij version 10.14
  ij> connect 'jdbc:derby://localhost:1527/metastore_db';
  ij> show tables;
  TABLE_SCHEM         |TABLE_NAME                    |REMARKS             
  ------------------------------------------------------------------------
  SYS                 |SYSALIASES                    |                    
  SYS                 |SYSCHECKS                     |                    
  SYS                 |SYSCOLPERMS                   |                    
  ... more

# There are two convenience scripts in the /opt/services directory to start and stop Derby 
# (run as root) These are used by systemd to start and stop Derby. There should be
# no need to run these "by hand."

# NOTE: start-derby.sh deletes any lock files in $DERBY_HOME/data/metastore_db/*.lck
# these sometimes cause issues restarting the database
  
  start-derby.sh  
  stop-derby.sh

Important:
----------

If using Derby use the hive-site.xml.derby. Copy this file to the conf directory

  /bin/cp files/hive-site.xml.derby /opt/apache-hive-4.0.1-bin/conf/hive-site.xml

(This overwrites the default MariaDB-oriented hive-site.xml installed in Step 1.)

Step 3: Start/Test Hive
=======================
# NOTE: In order for Spark to read/write Hive tables, the hiveserver2 and hive metastore
# daemons must be running.  These services are started with the following scripts
# (run automatically at boot by the systemd units enabled in Step 1)
  start-hive-metastore.sh
  start-hiveserver2.sh

# Make sure all Hadoop services are running (see above, Steps 12 and 13).
# As user hdfs

  su - hdfs

# Enter "hive" at the prompt. The Hive prompt connects to the beeline server (HiveServer2) and
# requires authentication using the "!connect jdbc:hive2://localhost:10000 -n hdfs" argument
# the "-n" is the user name
# Output as follows. 

  hive 
  Beeline version 4.0.1 by Apache Hive
  beeline> !connect jdbc:hive2://localhost:10000 -n hdfs 
  Connecting to jdbc:hive2://localhost:10000
  Connected to: Apache Hive (version 4.0.1)
  Driver: Hive JDBC (version 4.0.1)
  Transaction isolation: TRANSACTION_REPEATABLE_READ
  0: jdbc:hive2://localhost:10000> show tables;
  INFO  : Compiling command(queryId=hive_20260116105154_a331ea17-5e6b-45de-be24-5f10fceb7ba8): show tables
  INFO  : Semantic Analysis Completed (retrial = false)
  INFO  : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:tab_name, type:string, comment:from deserializer)], properties:null)
  INFO  : Completed compiling command(queryId=hive_20260116105154_a331ea17-5e6b-45de-be24-5f10fceb7ba8); Time taken: 0.008 seconds
  INFO  : Concurrency mode is disabled, not creating a lock manager
  INFO  : Executing command(queryId=hive_20260116105154_a331ea17-5e6b-45de-be24-5f10fceb7ba8): show tables
  INFO  : Starting task [Stage-0:DDL] in serial mode
  INFO  : Completed executing command(queryId=hive_20260116105154_a331ea17-5e6b-45de-be24-5f10fceb7ba8); Time taken: 0.007 seconds
  +-----------+
  | tab_name  |
  +-----------+
  +-----------+
  No rows selected (0.057 seconds)
  0: jdbc:hive2://localhost:10000> 

# Enter "!quit" or "ctrl-d" to exit

# The INFO messages can be silenced by adding the "--silent" argument to the hive command.

  hive --silent
  beeline> !connect jdbc:hive2://localhost:10000 -n hdfs 
  0: jdbc:hive2://localhost:10000> show tables;
  +-----------+
  | tab_name  |
  +-----------+
  +-----------+
  0: jdbc:hive2://localhost:10000> 


Step 4: Install Parquet Tools (optional)
========================================

# When using Hive tables with other tools (e.g. PySpark) it may
# necessary to obtain the schema of a Parquet formatted database table. 

# Parquet is an open source (portable) file format available to any project
# in the Hadoop ecosystem. Apache Parquet is designed for efficient as 
# well as performant flat columnar storage format of data compared 
# to row based files like CSV or TSV files.

# Parquet-tools is useful for examining parquet files

# make path for the jar file
mkdir /opt/parquet-tools
# pull down the jar file
wget -P /opt/parquet-tools https://repo1.maven.org/maven2/org/apache/parquet/parquet-tools/1.11.2/parquet-tools-1.11.2.jar

#add parquet-tools.sh helper script
cp scripts/parquet-tools.sh /opt/parquet-tools
chmod 755 /opt/parquet-tools/parquet-tools.sh 
chown -R yarn:hadoop /opt/parquet-tools

# add path for parquet-tools.sh

echo 'export PT_HOME=/opt/parquet-tools; export PATH=$PATH:$PT_HOME;' >/etc/profile.d/parquet-tools.sh
