Java tutorial: tokenizing documents with a Hadoop mapper
The mapper below, DataCopyTokenizerMapper, reads (documentId, text) pairs, runs each document's text through a configurable Lucene Analyzer, and emits the resulting tokens as a Mahout StringTuple. It also uses Hadoop counters to report how many terms it produced and how long tokenization took.

/*
 * Copyright 2012 The Trustees of Indiana University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * -----------------------------------------------------------------
 *
 * Project: knn
 * File: DataCopyTokenizerMapper.java
 * Description: Mapper that tokenizes document text with a Lucene
 *              Analyzer and emits each document as a StringTuple.
 *
 * -----------------------------------------------------------------
 */
package edu.indiana.d2i.htrc.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.DocumentProcessor;

public class DataCopyTokenizerMapper extends
    Mapper<Text, Text, Text, StringTuple> {

  private Analyzer analyzer;

  // Hadoop counters: total terms emitted and time spent tokenizing.
  private enum DataCopyTokenizer {
    NUM_TERMS, CPU_TIME
  }

  // Note: measured with System.nanoTime(), so this is elapsed wall-clock
  // time rather than true CPU time.
  private long elapsedTime = 0;
  private long numTerms = 0;

  @Override
  public void map(Text key, Text value, Context context) throws IOException,
      InterruptedException {
    long start = System.nanoTime();

    // Tokenize the document body with the Analyzer chosen in setup().
    TokenStream stream = analyzer.reusableTokenStream(key.toString(),
        new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
      if (termAtt.length() > 0) {
        String term = new String(termAtt.buffer(), 0, termAtt.length());
        document.add(term);
        numTerms++;
      }
    }

    elapsedTime += System.nanoTime() - start;
    context.write(key, document);
  }

  @Override
  protected void setup(Context context) throws IOException,
      InterruptedException {
    super.setup(context);
    // Instantiate the Analyzer named in the job configuration, falling back
    // to Mahout's DefaultAnalyzer.
    analyzer = ClassUtils.instantiateAs(
        context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS,
            DefaultAnalyzer.class.getName()), Analyzer.class);

    // Optional: load a term dictionary from the DistributedCache. Enabling
    // this block also requires declaring a field such as
    // private final Set<String> dictionary = new HashSet<String>();
    //
    // Configuration conf = context.getConfiguration();
    // URI[] localFiles = DistributedCache.getCacheFiles(conf);
    // if (localFiles == null || localFiles.length == 0)
    //   throw new RuntimeException("Cannot find paths in the distributed cache.");
    //
    // Path dictionaryFile = new Path(localFiles[0].getPath());
    // FileSystem fs = FileSystem.get(conf);
    // BufferedReader reader = new BufferedReader(new InputStreamReader(
    //     fs.open(dictionaryFile)));
    // String term = null;
    // while ((term = reader.readLine()) != null) {
    //   dictionary.add(term.toLowerCase());
    // }
    // reader.close();
  }

  @Override
  protected void cleanup(Context context) throws IOException,
      InterruptedException {
    // Publish the per-task totals as job counters.
    context.getCounter(DataCopyTokenizer.CPU_TIME).increment(elapsedTime);
    context.getCounter(DataCopyTokenizer.NUM_TERMS).increment(numTerms);
  }
}
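To run the mapper, it has to be wired into a job. The driver below is a minimal sketch, not part of the original project: the class name DataCopyTokenizerDriver, the map-only configuration, and the use of SequenceFile input and output are assumptions consistent with the mapper's Text/Text input and Text/StringTuple output.

package edu.indiana.d2i.htrc.io;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.DocumentProcessor;

// Hypothetical driver: wires DataCopyTokenizerMapper into a map-only job.
public class DataCopyTokenizerDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Tell the mapper which Lucene Analyzer to instantiate in setup().
    conf.set(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName());

    Job job = new Job(conf, "tokenize documents");
    job.setJarByClass(DataCopyTokenizerDriver.class);
    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0); // map-only: tokenized documents are written directly

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

After the job completes, the NUM_TERMS and CPU_TIME totals accumulated in cleanup() appear alongside the built-in counters in the job's counter report.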