Install MySQL
sudo apt-get update
sudo apt-get install mysql-server
sudo apt-get install mysql-client
# Log in to MySQL as root and set up the metastore database
sudo mysql -u root -p
CREATE DATABASE hive;
USE hive;
CREATE USER 'hive'@'localhost' IDENTIFIED BY 'hive';
CREATE USER 'hive'@'%' IDENTIFIED BY 'hive';
GRANT ALL ON hive.* TO 'hive'@'localhost';
GRANT ALL ON hive.* TO 'hive'@'%';
FLUSH PRIVILEGES;
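Optionally verify that the new user and database work:
mysql -u hive -p -e "SHOW DATABASES;"
# the hive database should be listed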
Install Hadoop
sudo apt-get install ssh
sudo apt-get install rsync
tar -xzvf hadoop-2.7.6.tar.gz
ln -s /home/yzchen/hadoop-2.7.6 hadoop
export JAVA_HOME=/home/yzchen/jdk
export HADOOP_HOME=/home/yzchen/hadoop
export PATH="/home/yzchen/hadoop/bin:$PATH"
mkdir -p /home/yzchen/hadoop/tmp
mkdir -p /home/yzchen/hadoop/hdfs/name
mkdir -p /home/yzchen/hadoop/hdfs/data
Configure passwordless SSH to localhost
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
ssh localhost
Configuration files
etc/hadoop/core-site.xml:
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/home/yzchen/hadoop/tmp</value>
</property>
</configuration>
etc/hadoop/hdfs-site.xml:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>/home/yzchen/hadoop/hdfs/name</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/home/yzchen/hadoop/hdfs/data</value>
</property>
</configuration>
etc/hadoop/yarn-site.xml:
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
Start (format the NameNode only on first use)
bin/hdfs namenode -format
sbin/start-dfs.sh
sbin/start-yarn.sh
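Optionally check that the daemons came up (standard Hadoop 2.7 ports):
jps    # expect NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager
bin/hdfs dfs -mkdir -p /user/yzchen
bin/hdfs dfs -ls /
# HDFS web UI: http://localhost:50070, YARN web UI: http://localhost:8088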
Start on boot
Add the following two scripts to /etc/init.d as hadoop and yarn.
/etc/init.d/hadoop:
#! /bin/sh
### BEGIN INIT INFO
# Provides: hadoop
# Required-Start: $local_fs $remote_fs $network $syslog
# Required-Stop: $local_fs $remote_fs $network $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: start hadoop
### END INIT INFO
export JAVA_HOME=/home/yzchen/jdk
case $1 in
start) su yzchen -c '/home/yzchen/hadoop/sbin/start-dfs.sh';;
stop) su yzchen -c '/home/yzchen/hadoop/sbin/stop-dfs.sh';;
*) echo "require start|stop" ;;
esac
exit 0
/etc/init.d/yarn:
#! /bin/sh
### BEGIN INIT INFO
# Provides: yarn
# Required-Start: $local_fs $remote_fs $network $syslog
# Required-Stop: $local_fs $remote_fs $network $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: start yarn
### END INIT INFO
export JAVA_HOME=/home/yzchen/jdk
case $1 in
start) su yzchen -c '/home/yzchen/hadoop/sbin/start-yarn.sh';;
stop) su yzchen -c '/home/yzchen/hadoop/sbin/stop-yarn.sh';;
*) echo "require start|stop" ;;
esac
exit 0
Register the boot-time services
export LC_ALL=en_US.UTF-8
sudo chmod 755 hadoop
sudo update-rc.d hadoop defaults 90
sudo chmod 755 yarn
sudo update-rc.d yarn defaults 90
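Before rebooting, the init scripts can be exercised by hand (optional check):
sudo service hadoop start
sudo service yarn start
sudo service hadoop stop
sudo service yarn stop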
Install Hive
wget http://mirrors.shu.edu.cn/apache/hive/hive-2.3.4/apache-hive-2.3.4-bin.tar.gz
tar -xzvf apache-hive-2.3.4-bin.tar.gz
ln -s /home/yzchen/apache-hive-2.3.4-bin hive
export HIVE_HOME="/home/yzchen/hive"
export PATH="/home/yzchen/hive/bin:$PATH"
Configuration files
cd hive/conf
cp hive-default.xml.template hive-site.xml
# In hive-site.xml, replace every occurrence of the relative path ${system:java.io.tmpdir}/${system:user.name} with the absolute path /home/yzchen/hive/tmp/hive
<property>
<name>hive.exec.scratchdir</name>
<value>/user/hive/tmp</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>hive.querylog.location</name>
<value>/user/hive/log</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true&amp;characterEncoding=UTF-8&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://localhost:9083</value>
</property>
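The scratch, warehouse, and log directories configured above live in HDFS and usually have to be created before first use; the local tmp directory from the comment above is needed as well (a common extra step, adjust permissions to taste):
mkdir -p /home/yzchen/hive/tmp/hive
hdfs dfs -mkdir -p /user/hive/warehouse /user/hive/tmp /user/hive/log
hdfs dfs -chmod -R 777 /user/hive/warehouse /user/hive/tmp /user/hive/log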
Download mysql-connector-java-5.1.43.jar and place it under hive/lib (the libmysql-java package below is an equivalent way to get the connector)
Initialize the metastore database
sudo apt-get install libmysql-java
ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar
# Either load the schema manually:
mysql -u root -p
USE hive;
source hive/scripts/metastore/upgrade/mysql/hive-schema-2.3.0.mysql.sql
# or let schematool create it:
bin/schematool -dbType mysql -initSchema
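Optionally confirm the schema landed in MySQL:
mysql -u hive -p hive -e "SHOW TABLES;"
# expect metastore tables such as DBS, TBLS, VERSION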
Start the metastore service
hive --service metastore
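The command above stays in the foreground; one common pattern is to background it and then run a quick query (the log path is only an example):
nohup hive --service metastore > /home/yzchen/hive/metastore.log 2>&1 &
hive -e "SHOW DATABASES;"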
Install Scala
tar -xvzf scala-2.11.8.tgz
ln -s /home/yzchen/scala-2.11.8 scala
export SCALA_HOME="/home/yzchen/scala"
export PATH="$SCALA_HOME/bin:$PATH"
Install Spark
wget https://archive.apache.org/dist/spark/spark-2.0.2/spark-2.0.2-bin-hadoop2.7.tgz
tar -xvzf spark-2.0.2-bin-hadoop2.7.tgz
ln -s /home/yzchen/spark-2.0.2-bin-hadoop2.7 spark
export SPARK_HOME=/home/yzchen/spark
export PATH=$SPARK_HOME/bin:$PATH
Edit the configuration
cp spark-env.sh.template spark-env.sh
vi spark-env.sh
export SPARK_MASTER_IP=cloud    # hostname of the master node
export SPARK_WORKER_CORES=2
export SPARK_WORKER_MEMORY=2g
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
Edit the slaves file (one worker hostname per line; the name must resolve, e.g. via a "127.0.1.1 cloud" entry in /etc/hosts)
cp slaves.template slaves
vi slaves
cloud
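With spark-env.sh and slaves in place, the standalone cluster can be started from $SPARK_HOME (assuming standalone mode is what SPARK_MASTER_IP above is for):
sbin/start-all.sh
# master web UI at http://cloud:8080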
PySpark in Spark 2.0.2 does not work with Python 3.6
Use conda to downgrade to 3.5
conda create -n python35
source activate python35
# install Python 3.5 inside the new environment
conda install python=3.5
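To make PySpark pick up this interpreter, PYSPARK_PYTHON can be exported in spark-env.sh as well (the conda path below is an assumption; adjust it to wherever the python35 env lives):
export PYSPARK_PYTHON=/home/yzchen/anaconda3/envs/python35/bin/python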
Set up IPython
Add the following to spark/conf/spark-env.sh
export PYSPARK_DRIVER_PYTHON="ipython"
Call PySpark from Jupyter
import sys, os

# locate the Spark installation via SPARK_HOME
spark_home = os.environ.get("SPARK_HOME", None)
if not spark_home:
    raise ValueError("SPARK_HOME environment variable is not set")

# make pyspark and its bundled py4j importable
# (the py4j version must match the zip under $SPARK_HOME/python/lib)
sys.path.insert(0, os.path.join(spark_home, "python"))
sys.path.insert(0, os.path.join(spark_home, "python/lib/py4j-0.10.4-src.zip"))

# run the pyspark shell bootstrap, which creates the spark and sc objects
exec(open(os.path.join(spark_home, "python/pyspark/shell.py")).read())
Connect to Hive
# Copy hive/conf/hive-site.xml to spark/conf
cp hive/conf/hive-site.xml spark/conf
# Make mysql-connector-java.jar available under spark/jars
ln -s /usr/share/java/mysql-connector-java.jar $SPARK_HOME/jars/mysql-connector-java.jar
# Start spark
spark-shell --jars $SPARK_HOME/jars/mysql-connector-java.jar
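A quick check that Spark can reach the Hive metastore (assumes the metastore service started earlier is still running):
spark-sql --jars $SPARK_HOME/jars/mysql-connector-java.jar -e "SHOW DATABASES;"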