10000 TEZ-4631: Include an official script that installs hadoop and tez and runs a simple example DAG by abstractdog · Pull Request #414 · apache/tez · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

TEZ-4631: Include an official script that installs hadoop and tez and runs a simple example DAG #414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 28, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions dev-support/bin/tez_run_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@

# This script is used to set up a local Hadoop and Tez environment for running a simple word count example.
# Prerequisites
# 1. java is installed and JAVA_HOME is set
# 2. ssh localhost works without password

# All parameters are optional:
# TEZ_VERSION: defaults to the latest version available on the Apache Tez download page
# HADOOP_VERSION: defaults to the version which belongs to the TEZ_VERSION
# TEZ_EXAMPLE_WORKING_DIR: defaults to the current working directory

# TEZ_VERSION comes from the environment; when it is unset or empty, the highest
# x.y.z version listed on the Apache Tez download page is used (e.g. 0.10.4).
if [ -z "${TEZ_VERSION:-}" ]; then
  # grep -o emits bare version strings only, so no further filtering is needed
  # before the version sort.
  TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" \
    | grep --color=never -o '[0-9]\+\.[0-9]\+\.[0-9]\+' \
    | sort -V \
    | tail -1)
fi
if [ -z "$TEZ_VERSION" ]; then
  # Without a version every later URL and path would be malformed, so stop here.
  echo "ERROR: could not determine TEZ_VERSION; set it explicitly, e.g. TEZ_VERSION=0.10.4" >&2
  exit 1
fi
export TEZ_VERSION

# The working directory holds the downloaded tarballs and the extracted installs.
export TEZ_EXAMPLE_WORKING_DIR="${TEZ_EXAMPLE_WORKING_DIR:-$PWD}"
cd "$TEZ_EXAMPLE_WORKING_DIR" || {
  echo "ERROR: cannot change to working directory '$TEZ_EXAMPLE_WORKING_DIR'" >&2
  exit 1
}

echo "TEZ_VERSION: $TEZ_VERSION"
# -nc keeps an already downloaded tarball, so reruns do not download it again.
wget -nc "https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz"
# Check for the file instead of wget's exit status: the tarball may legitimately
# come from an earlier run.
if [ ! -f "apache-tez-$TEZ_VERSION-bin.tar.gz" ]; then
  echo "ERROR: apache-tez-$TEZ_VERSION-bin.tar.gz is not available after download" >&2
  exit 1
fi

# Need to extract the Tez tarball early to get hadoop version it depends on
if [ ! -d "apache-tez-$TEZ_VERSION-bin" ]; then
  tar -xzf "apache-tez-$TEZ_VERSION-bin.tar.gz" || {
    echo "ERROR: failed to extract apache-tez-$TEZ_VERSION-bin.tar.gz" >&2
    exit 1
  }
fi

# HADOOP_VERSION comes from the environment; when it is unset or empty, it is
# read from the name of the hadoop-hdfs-client jar shipped in the Tez lib
# directory (e.g. 3.4.1).
if [ -z "${HADOOP_VERSION:-}" ]; then
  # sed -n ... p prints only names that match the expected pattern, so an
  # unmatched glob or an unexpected jar name yields an empty result instead of
  # a bogus version; head keeps the first match if several jars are present.
  HADOOP_VERSION=$(printf '%s\n' "apache-tez-$TEZ_VERSION-bin"/lib/hadoop-hdfs-client-*.jar \
    | sed -nE 's|^.*/hadoop-hdfs-client-([0-9]+\.[0-9]+\.[0-9]+)\.jar$|\1|p' \
    | head -n 1)
fi
if [ -z "$HADOOP_VERSION" ]; then
  echo "ERROR: could not determine HADOOP_VERSION from apache-tez-$TEZ_VERSION-bin/lib; set it explicitly, e.g. HADOOP_VERSION=3.4.1" >&2
  exit 1
fi
export HADOOP_VERSION

# Report the working directory and the resolved versions before the heavy work starts.
printf '%s\n' \
  "***" \
  "*** Demo setup script is running in $TEZ_EXAMPLE_WORKING_DIR ***" \
  "*** TEZ version: $TEZ_VERSION" \
  "*** HADOOP version $HADOOP_VERSION" \
  "***"

# Fetch the Hadoop release matching HADOOP_VERSION; -nc keeps an already
# downloaded tarball, so reruns do not download it again.
wget -nc "https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz"
# Check for the file instead of wget's exit status: the tarball may legitimately
# come from an earlier run.
if [ ! -f "hadoop-$HADOOP_VERSION.tar.gz" ]; then
  echo "ERROR: hadoop-$HADOOP_VERSION.tar.gz is not available after download" >&2
  exit 1
fi

# Extract only once; an existing directory is reused as-is.
if [ ! -d "hadoop-$HADOOP_VERSION" ]; then
  tar -xzf "hadoop-$HADOOP_VERSION.tar.gz" || {
    echo "ERROR: failed to extract hadoop-$HADOOP_VERSION.tar.gz" >&2
    exit 1
  }
fi

# Install locations inside the working directory.
HADOOP_HOME="${TEZ_EXAMPLE_WORKING_DIR}/hadoop-${HADOOP_VERSION}"
TEZ_HOME="${TEZ_EXAMPLE_WORKING_DIR}/apache-tez-${TEZ_VERSION}-bin"
# The '*' entries are kept literal on purpose: they are classpath wildcards,
# not shell globs.
HADOOP_CLASSPATH="${TEZ_HOME}/*:${TEZ_HOME}/lib/*:${TEZ_HOME}/conf"

# Make the hadoop/hdfs/yarn launchers callable by name.
PATH="${PATH}:${HADOOP_HOME}/bin"

export HADOOP_HOME TEZ_HOME HADOOP_CLASSPATH PATH

# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation
# Write hdfs-site.xml with a replication factor of 1.
# The redirect target is quoted so a working directory containing whitespace
# does not produce an "ambiguous redirect"; the quoted 'EOF' keeps the XML literal.
if ! cat <<'EOF' > "$HADOOP_HOME/etc/hadoop/hdfs-site.xml"
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
EOF
then
  echo "ERROR: cannot write $HADOOP_HOME/etc/hadoop/hdfs-site.xml" >&2
  exit 1
fi

# Write core-site.xml, pointing the default filesystem at hdfs://localhost:9000.
# The redirect target is quoted so a working directory containing whitespace
# does not produce an "ambiguous redirect"; the quoted 'EOF' keeps the XML literal.
if ! cat <<'EOF' > "$HADOOP_HOME/etc/hadoop/core-site.xml"
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
EOF
then
  echo "ERROR: cannot write $HADOOP_HOME/etc/hadoop/core-site.xml" >&2
  exit 1
fi

# Write yarn-site.xml, registering mapreduce_shuffle as a NodeManager auxiliary service.
# The redirect target is quoted so a working directory containing whitespace
# does not produce an "ambiguous redirect"; the quoted 'EOF' keeps the XML literal.
if ! cat <<'EOF' > "$HADOOP_HOME/etc/hadoop/yarn-site.xml"
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
EOF
then
  echo "ERROR: cannot write $HADOOP_HOME/etc/hadoop/yarn-site.xml" >&2
  exit 1
fi

# optionally stop previous clusters if any
# (best effort: their exit status is deliberately not checked, since there may
# be nothing to stop)
"$HADOOP_HOME/sbin/stop-dfs.sh"
"$HADOOP_HOME/sbin/stop-yarn.sh"

# Remove DataNode storage left by an earlier run before formatting the NameNode.
# NOTE(review): /tmp/hadoop-<user> is assumed to be Hadoop's default tmp dir for
# this setup, since the generated site files do not override it.
# USER can be unset (e.g. in containers), so fall back to the effective user name.
rm -rf -- "/tmp/hadoop-${USER:-$(id -un)}/dfs/data"
hdfs namenode -format -force || {
  echo "ERROR: formatting the HDFS NameNode failed" >&2
  exit 1
}

"$HADOOP_HOME/sbin/start-dfs.sh"
"$HADOOP_HOME/sbin/start-yarn.sh"

# Publish the Tez runtime tarball to HDFS under a versioned directory.
# Without it the example cannot start, so stop on failure instead of carrying on.
hadoop fs -mkdir -p "/apps/tez-$TEZ_VERSION" || {
  echo "ERROR: cannot create /apps/tez-$TEZ_VERSION on HDFS" >&2
  exit 1
}
hadoop fs -copyFromLocal "$TEZ_HOME/share/tez.tar.gz" "/apps/tez-$TEZ_VERSION" || {
  echo "ERROR: cannot upload $TEZ_HOME/share/tez.tar.gz to /apps/tez-$TEZ_VERSION" >&2
  exit 1
}

# create a simple tez-site.xml
# tez.lib.uris is set to the versioned HDFS path /apps/tez-$TEZ_VERSION/tez.tar.gz.
# The heredoc delimiter stays unquoted on purpose so $TEZ_VERSION is expanded;
# the redirect target is quoted so a working directory containing whitespace
# does not produce an "ambiguous redirect".
if ! cat <<EOF > "$TEZ_HOME/conf/tez-site.xml"
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>tez.lib.uris</name>
<value>/apps/tez-$TEZ_VERSION/tez.tar.gz</value>
</property>
</configuration>
EOF
then
  echo "ERROR: cannot write $TEZ_HOME/conf/tez-site.xml" >&2
  exit 1
fi

# Generate the sample input for the word count: one word per line, with
# Apple, Banana and Car each appearing twice.
printf '%s\n' \
  Apple Banana Car \
  Apple Banana Car \
  Dog Elephant Friend Game \
  > ./words.txt

# Stage the input file at the HDFS root.
hadoop fs -copyFromLocal words.txt /words.txt

export HADOOP_USER_CLASSPATH_FIRST=true
# finally run the example
# Stop with a non-zero status when the DAG fails, so callers and CI can detect
# it instead of reading a missing output as success.
yarn jar "$TEZ_HOME/tez-examples-$TEZ_VERSION.jar" orderedwordcount /words.txt /words_out || {
  echo "ERROR: the orderedwordcount example DAG failed" >&2
  exit 1
}

# check the output
hadoop fs -ls /words_out
hadoop fs -text /words_out/part-v002-o000-r-00000


# Closing message: the commands needed to rerun the DAG in the prepared
# environment (with the resolved paths filled in) and the local web UIs.
# The '*' characters are printed literally because they sit inside double quotes.
printf '%s\n' \
  "*** Since the environment is already set up, you can rerun the DAG using the commands below." \
  "" \
  "export HADOOP_USER_CLASSPATH_FIRST=true" \
  "export TEZ_HOME=$TEZ_EXAMPLE_WORKING_DIR/apache-tez-$TEZ_VERSION-bin" \
  "export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf" \
  "$HADOOP_HOME/bin/yarn jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out" \
  "" \
  "*** You can also visit some of the sites that are set up during the script execution." \
  "" \
  "Yarn RM: http://localhost:8088" \
  "HDFS NN: http://localhost:9870" \
  ""
0