List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf) throws IOException
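For orientation, here is a minimal, self-contained sketch of this constructor in use. The class name and paths are hypothetical, and note that new Job(conf) was later deprecated in favor of the static factory Job.getInstance(Configuration); the examples below predate that.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobConstructorExample {
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = new Job(conf); // the constructor this page documents
        job.setJobName("job-constructor-example");
        job.setJarByClass(JobConstructorExample.class);

        // Identity map-only pass over text input: keys are byte offsets
        // (LongWritable), values are lines (Text).
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Hypothetical paths, for illustration only.
        FileInputFormat.setInputPaths(job, new Path("/tmp/example/input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}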
From source file:com.digitalpebble.behemoth.mahout.DocumentProcessor.java
License:Apache License
/**
 * Convert the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in
 *          {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the {@link StringTuple} token array of
 *          each document has to be created
 * @param type
 *          the annotation type representing the tokens
 * @param feature
 *          the name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);
    job.waitForCompletion(true);
}
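A driver might call this helper roughly as follows; the paths and the annotation type and feature name ("Token", "string") are hypothetical and depend on the Behemoth annotations actually present in the corpus:

    // Hypothetical invocation of the method above.
    Path input = new Path("/behemoth/corpus");
    Path output = new Path("/behemoth/tokens");
    DocumentProcessor.tokenizeDocuments(input, "Token", "string", output);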
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents.
 * The input documents have to be in {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          cardinality of the output vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining the key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess,
        boolean namedVectors, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
            + ", dictionary-file: " + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
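A side note on the DistributedCache.setCacheFiles(...) call above: DistributedCache is deprecated in Hadoop 2.x and later, where the cache is configured through the Job itself. A minimal sketch of the newer form, assuming the same dictionaryFilePath:

    // Newer equivalent of DistributedCache.setCacheFiles(...): create the
    // job first, then register the cache file on it.
    Job job = Job.getInstance(conf);
    job.addCacheFile(dictionaryFilePath.toUri());
    // Tasks can then read the cached files via context.getCacheFiles().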
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents
 * have to be in {@link SequenceFile} format.
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(DictionaryVectorizer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents.
 * The input documents have to be in {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          cardinality of the output vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining the key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess,
        boolean namedVectors, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
            + ", dictionary-file: " + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents
 * have to be in {@link SequenceFile} format.
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.flytxt.yesbank.processor.HdfsToHbaseEngine.java
License:Open Source License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.out.println("Hdfs to Hbase Engine requires the model Id as input ...");
        System.exit(1);
    }
    String modelId = args[0];

    DBConnection dbConnection = DBConnection.getInstance();
    dbConnection.loadDbProperties();
    dbConnection.initializeDataBaseConnection();

    String hfdsInputLoc = dbConnection.getHdfsInputDirectory(modelId);
    if (hfdsInputLoc != null) {
        Configuration conf = new Configuration();
        String params = args[0];
        conf.set("test", params);

        Job job = new Job(conf);
        job.setJarByClass(HdfsToHbaseEngine.class);
        job.setMapperClass(HdfsEngineMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(hfdsInputLoc));
        // NullOutputFormat: the job writes no output files of its own
        job.setOutputFormatClass(NullOutputFormat.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    } else {
        System.out.println("Hdfs input location does not exist .. unable to process the request ....");
        System.exit(0);
    }
}
From source file:com.flytxt.yesbank.test.ModelProcessor.java
License:Open Source License
public static void main(String[] args) throws Exception {
    String modelId = args[0];

    ModelProcessor modelProcessor = new ModelProcessor();
    modelProcessor.loadDbProperties();
    modelProcessor.initializeDataBaseConnection();

    String hfdsInputLoc = modelProcessor.getHdfsInputDirectory(modelId);
    if (hfdsInputLoc != null) {
        Configuration conf = new Configuration();
        String params = args[0];
        conf.set("test", params);

        Job job = new Job(conf);
        job.setJarByClass(ModelProcessor.class);
        job.setMapperClass(HdfsProcessMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(hfdsInputLoc));
        // NullOutputFormat: the job writes no output files of its own
        job.setOutputFormatClass(NullOutputFormat.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    } else {
        System.out.println("Hdfs input location does not exist .. unable to process the request ....");
        System.exit(0);
    }
}
From source file:com.github.dryangkun.hbase.tidx.hive.HBaseStorageHandler.java
License:Apache License
private void addHBaseDelegationToken(Configuration conf) throws IOException {
    if (User.isHBaseSecurityEnabled(conf)) {
        HConnection conn = HConnectionManager.createConnection(conf);
        try {
            User curUser = User.getCurrent();
            Job job = new Job(conf);
            TokenUtil.addTokenForJob(conn, curUser, job);
        } catch (InterruptedException e) {
            throw new IOException("Error while obtaining hbase delegation token", e);
        } finally {
            conn.close();
        }
    }
}
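TokenUtil and HConnection belong to the older HBase client API. In HBase 1.x and later the same effect is more commonly achieved with TableMapReduceUtil.initCredentials, roughly as sketched here (assuming the job is already configured for HBase access):

    // Sketch: let the HBase MapReduce utility obtain the delegation token
    // and attach it to the job's credentials.
    Job job = Job.getInstance(conf);
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initCredentials(job);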
From source file:com.github.dryangkun.hbase.tidx.hive.HBaseStorageHandler.java
License:Apache License
@Override
public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    try {
        HBaseSerDe.configureJobConf(tableDesc, jobConf);
        /*
         * HIVE-6356
         * The following code change is only needed for hbase-0.96.0 due to HBASE-9165, and
         * will not be required once Hive bumps up its hbase version. At that time, we will
         * only need TableMapReduceUtil.addDependencyJars(jobConf) here.
         */
        if (counterClass != null) {
            TableMapReduceUtil.addDependencyJars(jobConf, HBaseStorageHandler.class,
                    TableInputFormatBase.class, counterClass);
        } else {
            TableMapReduceUtil.addDependencyJars(jobConf, HBaseStorageHandler.class,
                    TableInputFormatBase.class);
        }
        if (HiveConf.getVar(jobConf, HiveConf.ConfVars.HIVE_HBASE_SNAPSHOT_NAME) != null) {
            // There is an extra dependency on MetricsRegistry for the snapshot IF.
            TableMapReduceUtil.addDependencyJars(jobConf, MetricsRegistry.class);
        }

        Set<String> merged = new LinkedHashSet<String>(jobConf.getStringCollection("tmpjars"));
        Job copy = new Job(jobConf);
        TableMapReduceUtil.addDependencyJars(copy);
        merged.addAll(copy.getConfiguration().getStringCollection("tmpjars"));
        jobConf.set("tmpjars", StringUtils.arrayToString(merged.toArray(new String[0])));

        // Get credentials using the configuration instance which has HBase properties
        JobConf hbaseJobConf = new JobConf(getConf());
        org.apache.hadoop.hbase.mapred.TableMapReduceUtil.initCredentials(hbaseJobConf);
        ShimLoader.getHadoopShims().mergeCredentials(jobConf, hbaseJobConf);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split,
        JobConf jobConf, final Reporter reporter) throws IOException {

    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getTableSplit();

    Job job = new Job(jobConf);
    TaskAttemptContext tac =
            ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader;
    if (hbaseSplit.isTxIndexScan()) {
        LOG.info("getRecordReader: TxHiveIndexScan -> " + tableSplit);
        recordReader = TxHiveTableInputFormatUtil.createRecordReader(tableSplit, tac, jobConf);
    } else {
        LOG.info("getRecordReader: no TxHiveIndexScan -> " + tableSplit);
        setHTable(HiveHBaseInputFormatUtil.getTable(jobConf));
        setScan(HiveHBaseInputFormatUtil.getScan(jobConf));
        recordReader = createRecordReader(tableSplit, tac);
    }
    try {
        recordReader.initialize(tableSplit, tac);
    } catch (InterruptedException e) {
        throw new IOException("Failed to initialize RecordReader", e);
    }

    // Adapt the new-API (mapreduce) RecordReader to the old-API (mapred) interface.
    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public void close() throws IOException {
            recordReader.close();
            closeTable();
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(new Result());
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        @Override
        public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    value.setResult(recordReader.getCurrentValue());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}