eclipse配置hadoop2.7.2开发环境并本地跑起来详解大数据

　　先安装并启动hadoop，怎么弄见上文http://www.cnblogs.com/wuxun1997/p/6847950.html。这里说下怎么设置IDE来开发hadoop代码和调试。首先要确保你本地装了eclipse，再下个eclipse的hadoop插件就完事了。下面细说一下：

　　1、到http://download.csdn.net/detail/wuxun1997/9841487下载eclipse插件并丢到eclipse的plugins目录下，重启eclipse，Project Explorer出现DFS Locations；

　　2、点击Window->点Preferences->点Hadoop Map/Reduce->填D:/hadoop-2.7.2并OK；

　　3、点击Window->点Show View->点MapReduce Tools下的Map/Reduce Locations->点右边角一个带+号的小象图标"New hadoop location"->eclipse已填好默认参数，但以下几个参数需要修改一下，参见上文中的两个配置文件core-site.xml和hdfs-site.xml：

　　General->Map/Reduce(V2) Master->Port改为9001

　　General->DFS Master->Port改为9000

　　Advanced parameters->dfs.datanode.data.dir改为file:/hadoop/data/dfs/datanode

　　Advanced parameters->dfs.namenode.name.dir改为file:/hadoop/data/dfs/namenode

　　4、点击Finish后在DFS Locations右键点击左边三角图标，出现hdfs文件夹，可以直接在这里操作hdfs，右键点击文件夹图标选"Create new directory"即可新增，再次右键点击文件夹图标选Refresh出现新增的结果；此时在localhost:50070->Utilities->Browse the file system也可以看到新增的结果；

　　5、新建hadoop项目：File->New->Project->Map/Reduce Project->next->输入自己取的项目名如hadoop再点Finish

　　6、这里的代码演示最常见的分词例子，统计的是中文小说里的人名并降序排列。为了中文分词需要导入一个jar，在这里下载http://download.csdn.net/detail/wuxun1997/9841659。项目结构如下:

hadoop

|--src

|--com.wulinfeng.hadoop.wordsplit

|--WordSplit.java

|--IKAnalyzer.cfg.xml

|--myext.dic

|--mystopword.dic

WordSplit.java

package com.wulinfeng.hadoop.wordsplit; 
 
import java.io.IOException; 
import java.io.StringReader; 
 
import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.fs.FileSystem; 
import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.io.IntWritable; 
import org.apache.hadoop.io.Text; 
import org.apache.hadoop.io.WritableComparable; 
import org.apache.hadoop.mapreduce.Job; 
import org.apache.hadoop.mapreduce.Mapper; 
import org.apache.hadoop.mapreduce.Reducer; 
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 
import org.apache.hadoop.mapreduce.lib.map.InverseMapper; 
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 
import org.wltea.analyzer.core.IKSegmenter; 
import org.wltea.analyzer.core.Lexeme; 
 
public class WordSplit { 
     
    /** 
     * map实现分词 
     * @author Administrator 
     * 
     */ 
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> { 
        private static final IntWritable one = new IntWritable(1); 
        private Text word = new Text(); 
 
        public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) 
                throws IOException, InterruptedException { 
            StringReader input = new StringReader(value.toString()); 
            IKSegmenter ikSeg = new IKSegmenter(input, true); // 智能分词 
            for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) { 
                this.word.set(lexeme.getLexemeText()); 
                context.write(this.word, one); 
            } 
        } 
    } 
 
    /** 
     * reduce实现分词累计 
     * @author Administrator 
     * 
     */ 
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> { 
        private IntWritable result = new IntWritable(); 
 
        public void reduce(Text key, Iterable<IntWritable> values, 
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) 
                throws IOException, InterruptedException { 
            int sum = 0; 
            for (IntWritable val : values) { 
                sum += val.get(); 
            } 
            this.result.set(sum); 
            context.write(key, this.result); 
        } 
    } 
 
    public static void main(String[] args) throws Exception { 
        Configuration conf = new Configuration(); 
        String inputFile = "/input/people.txt"; // 输入文件 
        Path outDir = new Path("/out"); // 输出目录 
        Path tempDir = new Path("/tmp" + System.currentTimeMillis()); // 临时目录 
 
        // 第一个任务：分词 
        System.out.println("start task..."); 
        Job job = Job.getInstance(conf, "word split"); 
        job.setJarByClass(WordSplit.class); 
        job.setMapperClass(TokenizerMapper.class); 
        job.setCombinerClass(IntSumReducer.class); 
        job.setReducerClass(IntSumReducer.class); 
        job.setOutputKeyClass(Text.class); 
        job.setOutputValueClass(IntWritable.class); 
        FileInputFormat.addInputPath(job, new Path(inputFile)); 
        FileOutputFormat.setOutputPath(job, tempDir); 
 
        // 第一个任务结束，输出作为第二个任务的输入，开始排序任务 
        job.setOutputFormatClass(SequenceFileOutputFormat.class); 
        if (job.waitForCompletion(true)) { 
            System.out.println("start sort..."); 
            Job sortJob = Job.getInstance(conf, "word sort"); 
            sortJob.setJarByClass(WordSplit.class); 
            sortJob.setMapperClass(InverseMapper.class); 
            sortJob.setInputFormatClass(SequenceFileInputFormat.class); 
 
            // 反转map键值，计算词频并降序 
            sortJob.setMapOutputKeyClass(IntWritable.class); 
            sortJob.setMapOutputValueClass(Text.class); 
            sortJob.setSortComparatorClass(IntWritableDecreasingComparator.class); 
            sortJob.setNumReduceTasks(1); 
 
            // 输出到out目录文件 
            sortJob.setOutputKeyClass(IntWritable.class); 
            sortJob.setOutputValueClass(Text.class); 
            FileInputFormat.addInputPath(sortJob, tempDir); 
 
            // 如果已经有out目录，先删再创建 
            FileSystem fileSystem = outDir.getFileSystem(conf); 
            if (fileSystem.exists(outDir)) { 
                fileSystem.delete(outDir, true); 
            } 
            FileOutputFormat.setOutputPath(sortJob, outDir); 
 
            if (sortJob.waitForCompletion(true)) { 
                System.out.println("finish and quit...."); 
                // 删掉临时目录 
                fileSystem = tempDir.getFileSystem(conf); 
                if (fileSystem.exists(tempDir)) { 
                    fileSystem.delete(tempDir, true); 
                } 
                System.exit(0); 
            } 
        } 
    } 
 
    /** 
     * 实现降序 
     *  
     * @author Administrator 
     * 
     */ 
    private static class IntWritableDecreasingComparator extends IntWritable.Comparator { 
        public int compare(WritableComparable a, WritableComparable b) { 
            return -super.compare(a, b); 
        } 
 
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { 
            return -super.compare(b1, s1, l1, b2, s2, l2); 
        } 
    } 
}

IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?> 
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"> 
<properties> 
    <comment>IK Analyzer 扩展配置</comment> 
    <!-- Users can configure their own extension dictionary here --> 
    <entry key="ext_dict">myext.dic</entry> 
    <!-- Users can configure their own extension stop-word dictionary here --> 
    <entry key="ext_stopwords">mystopword.dic</entry> 
</properties>

myext.dic

高育良 
祁同伟 
陈海 
陈岩石 
侯亮平 
高小琴 
沙瑞金 
李达康 
蔡成功

mystopword.dic

你 
我 
他 
是 
的 
了 
啊 
说 
也 
和 
在 
就

　　这里直接在eclipse跑WordSplit类，右键选择Run as -> Run on hadoop。上面的输入输出都是本地文件，在D盘建一个input目录，里面放个文件名叫people.txt的小说，是网上下载下来的热剧《人民的名义》。为了分词需要设置文件格式：把people.txt去Notepad++里打开，点编码->以UTF-8无BOM格式编码。在myext.dic里输入一些不想再拆分的人名，在mystopword.dic输入想要过滤掉的一些谓词和助词，跑完去D:/out里看part-r-00000文件即可知道谁是猪脚。

　　如果想把输入输出设置到hdfs也容易，只要把WordSplit.java里的路径加个前缀hdfs://localhost:9000就完事了：

        String inputFile = "hdfs://localhost:9000/input/people.txt"; // input file on HDFS
        Path outDir = new Path("hdfs://localhost:9000/out"); // output directory on HDFS
        Path tempDir = new Path("hdfs://localhost:9000/tmp" + System.currentTimeMillis()); // temporary directory on HDFS

　　当然你得先把小说传到hdfs上才能跑，可以在cmd里用hdfs命令，也可以直接在eclipse里操作，怎么弄看上面第4步。跑完再点Refresh可以直接看结果文件。如果要重启hadoop，记得先把eclipse关了，在命令行里起了hadoop再打开eclipse接着玩。

原创文章，作者：ItWorker，如若转载，请注明出处：https://blog.ytso.com/tech/bigdata/7756.html

eclipse配置hadoop2.7.2开发环境并本地跑起来详解大数据

相关推荐

发表回复