I. Hadoop 3.0.4 Environment Setup
1. Requirements
- Java 8 (Hadoop 3.0.4 does not support Java 11 or later)
- A single-node or multi-node Linux system (Ubuntu 18.04+ recommended)
- At least 4 GB of RAM (8 GB+ recommended)
- At least 50 GB of free disk space
2. Install Java
# Install Java 8
sudo apt-get install openjdk-8-jdk
# Verify the installation
java -version
3. Download and Install Hadoop 3.0.4
# Download Hadoop 3.0.4
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.0.4/hadoop-3.0.4.tar.gz
# Extract to /usr/local, create a symlink, and take ownership so Hadoop can run as the current user
sudo tar -zxvf hadoop-3.0.4.tar.gz -C /usr/local/
sudo ln -s /usr/local/hadoop-3.0.4 /usr/local/hadoop
sudo chown -R $USER: /usr/local/hadoop-3.0.4
# Set environment variables (single quotes keep $HADOOP_HOME from being expanded before .bashrc is sourced)
echo 'export HADOOP_HOME=/usr/local/hadoop' >> ~/.bashrc
echo 'export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH' >> ~/.bashrc
source ~/.bashrc
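In addition, start-dfs.sh and start-yarn.sh typically require JAVA_HOME to be set in hadoop-env.sh and passwordless SSH to localhost. A minimal sketch follows; the JDK path is the usual location for Ubuntu's openjdk-8-jdk package, so adjust it for your system:
# Point Hadoop at the JDK (path assumes Ubuntu's openjdk-8-jdk package)
echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64' >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# Set up passwordless SSH to localhost for the start/stop scripts
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
# Verify: this should log in without prompting for a password
ssh localhost exit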
4. Configure Hadoop
Core configuration (core-site.xml)
# Enter the configuration directory
cd $HADOOP_HOME/etc/hadoop
# Edit core-site.xml
vi core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/tmp</value>
</property>
</configuration>
HDFS configuration
vi hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value> <!-- set to 1 for a single-node deployment -->
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/local/hadoop/tmp/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/usr/local/hadoop/tmp/data</value>
</property>
</configuration>
YARN configuration
vi yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>localhost</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
MapReduce configuration
vi mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
5. Format HDFS and Start the Services
# Format the NameNode
hdfs namenode -format
# Start HDFS
start-dfs.sh
# Start YARN
start-yarn.sh
# Check that the daemons are running
jps
If everything started correctly, jps should list the NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager processes.
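You can also check the web UIs; the ports below are the Hadoop 3 defaults:
# NameNode web UI (Hadoop 3 default port 9870)
curl -s http://localhost:9870/ > /dev/null && echo "HDFS UI is up"
# ResourceManager web UI (default port 8088)
curl -s http://localhost:8088/ > /dev/null && echo "YARN UI is up"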
II. MapReduce Program Development
1. Project Structure
The project uses Maven for dependency management, with the following layout:
mr-demo/
├── src/
│ ├── main/
│ │ ├── java/
│ │ │ └── com/
│ │ │ └── example/
│ │ │ ├── WordCountMapper.java
│ │ │ ├── WordCountReducer.java
│ │ │ └── WordCountDriver.java
│ │ └── resources/
│ └── test/
│ └── java/
└── pom.xml
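One way to create this skeleton is with the Maven quickstart archetype (an optional convenience, not a requirement; any way of producing the layout above works):
# Generate a minimal Maven project skeleton, then add the classes under com.example
mvn archetype:generate -DgroupId=com.example -DartifactId=mr-demo \
    -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false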
2. Maven Dependencies (pom.xml)
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>mr-demo</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<hadoop.version>3.0.4</hadoop.version>
</properties>
<dependencies>
<!-- Core Hadoop dependencies -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.4</version>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
3. Core Code
Mapper class (WordCountMapper.java)
package com.example;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Split the line into words on whitespace
String line = value.toString();
String[] words = line.split("\\s+");
// Emit each word with a count of 1, skipping empty tokens
for (String w : words) {
if (w.isEmpty()) continue;
word.set(w);
context.write(word, one);
}
}
}
Reducer class (WordCountReducer.java)
package com.example;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
// Sum the occurrences of the word
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
Driver class (WordCountDriver.java)
package com.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDriver {
public static void main(String[] args) throws Exception {
// Check arguments
if (args.length != 2) {
System.err.println("Usage: WordCountDriver <input path> <output path>");
System.exit(2);
}
// Create the job configuration
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCountDriver.class);
// Set the Mapper and Reducer classes (the Reducer also serves as the Combiner)
job.setMapperClass(WordCountMapper.class);
job.setCombinerClass(WordCountReducer.class);
job.setReducerClass(WordCountReducer.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Submit the job and wait for it to finish
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
III. Packaging and the Core JARs
1. Build the Executable JAR
# Run from the project root
mvn clean package
2. Generated JAR Files
After the build completes, the target directory contains:
- mr-demo-1.0-SNAPSHOT.jar: the shaded fat JAR with all dependencies bundled (with the shade-plugin configuration above, the shaded JAR replaces the plain artifact); this is the one to submit
- original-mr-demo-1.0-SNAPSHOT.jar: the original thin JAR containing only the application classes
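A quick way to confirm the fat JAR contains the driver class (the grep target is simply the class from this example):
# List the JAR contents and look for the driver class
jar tf target/mr-demo-1.0-SNAPSHOT.jar | grep WordCountDriver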
3. Hadoop Core JAR Locations
The core Hadoop 3.0.4 JARs are located under:
$HADOOP_HOME/share/hadoop/common/
├── hadoop-common-3.0.4.jar
├── hadoop-nfs-3.0.4.jar
└── ...
$HADOOP_HOME/share/hadoop/mapreduce/
├── hadoop-mapreduce-client-core-3.0.4.jar
├── hadoop-mapreduce-client-common-3.0.4.jar
└── ...
$HADOOP_HOME/share/hadoop/hdfs/
├── hadoop-hdfs-3.0.4.jar
└── ...
IV. Deployment and Execution
1. Prepare Input Data
# Create input files
echo "Hello Hadoop MapReduce" > input.txt
echo "Hadoop is a distributed system" > input2.txt
# Upload to HDFS
hdfs dfs -mkdir -p /user/input
hdfs dfs -put input*.txt /user/input
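To confirm the files landed in HDFS:
# List the uploaded input files
hdfs dfs -ls /user/input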
2. Submit the MapReduce Job
# Submit the job using the fat JAR
hadoop jar target/mr-demo-1.0-SNAPSHOT.jar \
com.example.WordCountDriver \
/user/input \
/user/output
3. Check the Job Status
# View in the YARN web UI: open http://localhost:8088/cluster in a browser (default port 8088)
# Or from the command line
yarn application -list
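If log aggregation is enabled (yarn.log-aggregation-enable set to true), the container logs of a finished application can be pulled with yarn logs; the application id below is a placeholder for whatever yarn application -list reports:
# Fetch aggregated logs for a finished application (replace <application_id>)
yarn logs -applicationId <application_id>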
4. View the Results
# Show the job output
hdfs dfs -cat /user/output/part-r-00000
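For the two sample files created above, the word counts are fully determined, so the output should look roughly like this (word and count separated by a tab):
Hadoop	2
Hello	1
MapReduce	1
a	1
distributed	1
is	1
system	1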
V. Common Problems and Solutions
1. Permission Errors
# Error: Permission denied
hdfs dfs -chmod -R 777 /user
2. Output Directory Already Exists
# Error: Output directory hdfs://... already exists — delete the old output directory
hdfs dfs -rm -r /user/output
3. Insufficient Memory
# Increase the memory settings in yarn-site.xml
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>8192</value> <!-- 8 GB -->
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>8192</value>
</property>
4. JAR Dependency Conflicts
# Exclude the conflicting transitive dependency inside the affected <dependency> block of pom.xml (the artifact below is a placeholder)
<exclusion>
<groupId>com.example</groupId>
<artifactId>conflict-library</artifactId>
</exclusion>
VI. Further Tuning
1. MapReduce Job Parameters
// Set job parameters in the Driver before creating the Job
Configuration conf = new Configuration();
conf.set("mapreduce.map.memory.mb", "2048"); // Map任务内存
conf.set("mapreduce.reduce.memory.mb", "4096"); // Reduce任务内存
conf.set("mapreduce.map.cpu.vcores", "2"); // Map任务CPU核心数
2. Custom Partitioner
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Custom Partitioner: route words to reducers by word length
public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
@Override
public int getPartition(Text key, IntWritable value, int numPartitions) {
return key.toString().length() % numPartitions;
}
}
// Configure it in the Driver
job.setPartitionerClass(WordCountPartitioner.class);
job.setNumReduceTasks(4); // use 4 reduce tasks
VII. Summary
In this walkthrough we developed and deployed a MapReduce program on Hadoop 3.0.4, covering:
- Setting up and configuring a single-node Hadoop cluster
- Implementing the core MapReduce components (Mapper/Reducer/Driver)
- Managing dependencies with Maven and building an executable JAR
- Submitting the job and verifying its output
Although MapReduce is gradually being displaced by Spark and Flink for real-time workloads, it remains Hadoop's foundational computation model. Understanding how MapReduce works and how to develop against it is a solid way to grasp the core ideas of distributed computing.
In real projects, depending on data volume and business requirements, you can go further by tuning job parameters, defining custom data types, and implementing more complex join logic to make full use of the cluster's computing capacity.