List of usage examples for org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.setNumLinesPerSplit
public static void setNumLinesPerSplit(Job job, int numLines)
From source file:boostingPL.driver.AdaBoostPLDriver.java
License:Open Source License
@Override public int run(String[] args) throws Exception { int status = commandAnalysis(args); if (status != 0) { return status; }//from w w w .ja v a2 s. co m @SuppressWarnings("deprecation") Job job = new Job(getConf()); job.setJobName("AdaBoostPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " " + numLinesPerMap + " " + numIterations); job.setJarByClass(AdaBoostPLDriver.class); job.setInputFormatClass(NLineInputFormat.class); NLineInputFormat.addInputPath(job, dataPath); NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap); if (runModel.equals("train")) { job.setMapperClass(AdaBoostPLMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(ClassifierWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(ClassifierWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(job, modelPath); } else { job.setMapperClass(AdaBoostPLTestMapper.class); job.setReducerClass(AdaBoostPLTestReducer.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); } Configuration conf = job.getConfiguration(); conf.set("BoostingPL.boostingName", "AdaBoost"); conf.set("BoostingPL.numIterations", String.valueOf(numIterations)); conf.set("BoostingPL.modelPath", modelPath.toString()); if (metadataPath == null) { conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata"); } else { conf.set("BoostingPL.metadata", metadataPath.toString()); } if (outputFolder != null) { conf.set("BoostingPL.outputFolder", outputFolder.toString()); } LOG.info(StringUtils.arrayToString(args)); return job.waitForCompletion(true) == true ? 0 : -1; }
From source file:boostingPL.driver.SAMMEPLDriver.java
License:Open Source License
@Override public int run(String[] args) throws Exception { int status = commandAnalysis(args); if (status != 0) { return status; }//from ww w. ja va2 s. c o m @SuppressWarnings("deprecation") Job job = new Job(getConf()); job.setJobName("SAMMEPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " " + numLinesPerMap + " " + numIterations); job.setJarByClass(SAMMEPLDriver.class); job.setInputFormatClass(NLineInputFormat.class); NLineInputFormat.addInputPath(job, dataPath); NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap); FileSystem fs = modelPath.getFileSystem(getConf()); if (fs.exists(modelPath)) { fs.delete(modelPath, true); } job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(job, modelPath); if (runModel.equals("train")) { job.setMapperClass(AdaBoostPLMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(ClassifierWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(ClassifierWritable.class); } else { job.setMapperClass(AdaBoostPLTestMapper.class); job.setReducerClass(AdaBoostPLTestReducer.class); job.setOutputFormatClass(NullOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); } Configuration conf = job.getConfiguration(); conf.set("BoostingPL.boostingName", "SAMME"); conf.set("BoostingPL.numIterations", String.valueOf(numIterations)); conf.set("BoostingPL.modelPath", modelPath.toString()); if (metadataPath == null) { conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata"); } else { conf.set("BoostingPL.metadata", metadataPath.toString()); } if (outputFolder != null) { conf.set("BoostingPL.outputFolder", outputFolder.toString()); } LOG.info(StringUtils.arrayToString(args)); return job.waitForCompletion(true) == true ? 0 : -1; }
From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>"); return -1; }//w w w . java 2 s . c om TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class, RDFParser.class); HBaseConfiguration.addHbaseResources(getConf()); if (SnappyCodec.isNativeCodeLoaded()) { getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true); getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class); } getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0); getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l); getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100); getConf().setInt(MRJobConfig.IO_SORT_MB, 1000); getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000); getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048); getConf().setStrings(TABLE_NAME_PROPERTY, args[2]); Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]); NLineInputFormat.setNumLinesPerSplit(job, 1); job.setJarByClass(HalyardBulkUpdate.class); job.setMapperClass(SPARQLMapper.class); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(KeyValue.class); job.setInputFormatClass(NLineInputFormat.class); job.setSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) { HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator()); FileInputFormat.setInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); TableMapReduceUtil.addDependencyJars(job); TableMapReduceUtil.initCredentials(job); if 
(job.waitForCompletion(true)) { new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable); LOG.info("Bulk Update Completed.."); return 0; } } return -1; }
From source file:eu.edisonproject.classification.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
/**
 * Configures and runs the "Word Frequency Term Driver" job.
 *
 * <p>Arguments, by position as read in this method:
 * <ol start="0">
 *   <li>dictionary file, used as the job input</li>
 *   <li>output path (deleted recursively before the job starts)</li>
 *   <li>local directory of documents; its {@code txt} files are shipped via the cache</li>
 *   <li>stop-words file, shipped via the cache</li>
 *   <li>number of input lines per split</li>
 * </ol>
 *
 * <p>When the default file system is not a {@code file} one, the dictionary, the documents
 * and the stop-words file are first copied from the local file system.
 *
 * @param args positional arguments as listed above
 * @return {@code 0} if the job completes successfully, {@code 1} otherwise
 * @throws IllegalArgumentException if the documents directory cannot be listed
 * @throws Exception if file-system access or the job itself fails
 */
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    Job job = Job.getInstance(jobconf);
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    Path dictionary = new Path(args[0]);
    Path dictionaryHdfs = dictionary;

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = localDocs;

    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        dictionaryHdfs = new Path(dictionary.getName());
        if (!fs.exists(dictionaryHdfs)) {
            fs.copyFromLocalFile(dictionary, dictionaryHdfs);
        }

        hdfsDocs = new Path(localDocs.getName());
        fs.mkdirs(hdfsDocs);
        fs.deleteOnExit(hdfsDocs);

        // File.listFiles() returns null when the path is not a directory or cannot be read.
        File[] stats = new File(localDocs.toString()).listFiles();
        if (stats == null) {
            throw new IllegalArgumentException("Cannot list documents directory: " + localDocs);
        }
        for (File stat : stats) {
            Path filePath = new Path(stat.getAbsolutePath());
            if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
                Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
                fs.copyFromLocalFile(filePath, dest);
            }
        }

        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    // Re-read the path from the file status before registering it as a cache file.
    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();
    job.addCacheFile(stopwordsHDFS.toUri());
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    // Register the input path exactly once. Calling addInputPath for the same path after
    // setInputPaths would list it twice, and every dictionary line would be read twice.
    FileInputFormat.setInputPaths(job, dictionaryHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.parseInt(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    // NOTE(review): Integer is not a Writable; IntWritable may have been intended.
    // Confirm against TermWordFrequencyReducer before changing.
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration jobconf = getConf(); FileSystem fs = FileSystem.get(jobconf); fs.delete(new Path(args[1]), true); Path in = new Path(args[0]); Path inHdfs = in;//ww w . j av a 2 s .c om if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) { inHdfs = new Path(in.getName()); fs.delete(inHdfs, true); fs.copyFromLocalFile(in, inHdfs); fs.deleteOnExit(inHdfs); FileStatus inHdfsStatus = fs.getFileStatus(inHdfs); // Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}", new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()}); } Job job = Job.getInstance(jobconf); Path stopwordsLocal = new Path(args[3]); stopwords = new Path(stopwordsLocal.getName()); fs.delete(stopwords, true); fs.copyFromLocalFile(stopwordsLocal, stopwords); fs.deleteOnExit(stopwords); FileStatus stopwordsStatus = fs.getFileStatus(stopwords); stopwords = stopwordsStatus.getPath(); job.addCacheFile(stopwords.toUri()); Path localDocs = new Path(args[2]); Path hdfsDocs = new Path(localDocs.getName()); fs.mkdirs(hdfsDocs); hdfsDocs = fs.getFileStatus(hdfsDocs).getPath(); fs.delete(hdfsDocs, true); // FileStatus[] stats = fs.listStatus(localDocs); File[] stats = new File(localDocs.toString()).listFiles(); for (File stat : stats) { // for (FileStatus stat : stats) { Path filePath = new Path(stat.getAbsolutePath()); if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) { Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName()); fs.copyFromLocalFile(filePath, dest); } } job.addCacheFile(hdfsDocs.toUri()); job.setJarByClass(TermWordFrequency.class); job.setJobName("Word Frequency Term Driver"); FileInputFormat.setInputPaths(job, inHdfs); FileOutputFormat.setOutputPath(job, new Path(args[1])); // job.setInputFormatClass(TextInputFormat.class); job.setInputFormatClass(NLineInputFormat.class); NLineInputFormat.addInputPath(job, inHdfs); NLineInputFormat.setNumLinesPerSplit(job, 
Integer.valueOf(args[4])); NLineInputFormat.setMaxInputSplitSize(job, 500); Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}", NLineInputFormat.getNumLinesPerSplit(job)); job.setMapperClass(TermWordFrequencyMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Integer.class); job.setReducerClass(TermWordFrequencyReducer.class); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:io.apigee.lembos.node.types.NLineInputFormatWrap.java
License:Apache License
/**
 * JavaScript-facing wrapper around
 * {@link NLineInputFormat#setNumLinesPerSplit(org.apache.hadoop.mapreduce.Job, int)}.
 *
 * <p>The call requires two defined arguments: a {@code JobWrap} followed by a number.
 * Any violation is reported as an error created through {@code Utils.makeError}.
 *
 * @param ctx the JavaScript context
 * @param thisObj the 'this' object
 * @param args the function arguments: the job wrapper, then the number of lines per split
 * @param func the function called (unused)
 */
@JSStaticFunction
public static void setNumLinesPerSplit(final Context ctx, final Scriptable thisObj, final Object[] args,
        final Function func) {
    if (args.length < 2) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.TWO_ARGS_EXPECTED);
    }

    final Object jobArg = args[0];
    final Object linesArg = args[1];

    // Presence is checked for both arguments before either type is checked.
    if (!JavaScriptUtils.isDefined(jobArg)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_REQUIRED);
    }
    if (!JavaScriptUtils.isDefined(linesArg)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.SECOND_ARG_REQUIRED);
    }
    if (!(jobArg instanceof JobWrap)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_MUST_BE_JOB);
    }
    if (!(linesArg instanceof Number)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.SECOND_ARG_ARG_MUST_BE_NUM);
    }

    final JobWrap jobWrap = (JobWrap) jobArg;
    final int numLines = JavaScriptUtils.fromNumber(linesArg).intValue();
    NLineInputFormat.setNumLinesPerSplit(jobWrap.getJob(), numLines);
}
From source file:org.apache.accumulo.test.mrit.IntegrationTestMapReduce.java
License:Apache License
@Override public int run(String[] args) throws Exception { // read a list of tests from the input, and print out the results if (args.length != 2) { System.err.println("Wrong number of args: <input> <output>"); return 1; }/*from ww w . j ava 2 s . com*/ Configuration conf = getConf(); Job job = Job.getInstance(conf, "accumulo integration test runner"); conf = job.getConfiguration(); // some tests take more than 10 minutes conf.setLong(MRJobConfig.TASK_TIMEOUT, 20 * 60 * 1000); // minicluster uses a lot of ram conf.setInt(MRJobConfig.MAP_MEMORY_MB, 4000); // hadoop puts an ancient version of jline on the classpath conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true); // no need to run a test multiple times job.setSpeculativeExecution(false); // read one line at a time job.setInputFormatClass(NLineInputFormat.class); NLineInputFormat.setNumLinesPerSplit(job, 1); // run the test job.setJarByClass(IntegrationTestMapReduce.class); job.setMapperClass(TestMapper.class); // group test by result code job.setReducerClass(TestReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a test with a single input/*from w w w. j a va 2 s.c o m*/ * * @param config * Configuration * @param input * Input * @param expectedTuples * Expected tuples * @throws IOException * @throws InterruptedException */ protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); this.addInputPath(input, job.getConfiguration(), job); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); this.checkTuples(reader, expectedTuples); } }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a multiple input test/*w w w .j a v a 2 s .co m*/ * * @param inputs * Inputs * @param expectedSplits * Number of splits expected * @param expectedTuples * Number of tuples expected * @throws IOException * @throws InterruptedException */ protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Prepare configuration and inputs Configuration config = this.prepareConfiguration(); // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, expectedTuples); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes with the same identity in a single * file, splits them over two files and checks that we can workaround * JENA-820 successfully by setting the//from ww w. j av a2 s . c o m * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job * * @throws IOException * @throws InterruptedException */ @Test public final void blank_node_divergence_01() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_divergence", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile(); try { // Prepare the input data // Two mentions of the same blank node in the same file List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(a, tuples); // Set up fake job which will process the file as a single split Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); for (InputSplit split : splits) { // Initialize the input reading 
TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - each triple goes to a separate // output file // This is how we force multiple files to be produced int taskID = 1; while (reader.nextKeyValue()) { // Prepare the output writing OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); writer.write(reader.getCurrentKey(), reader.getCurrentValue()); writer.close(outputTaskContext); } } // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // As described in JENA-820 at this point the blank nodes are // consistent, however when we read them from different files they // by default get treated as different nodes and so the blank nodes // diverge which is incorrect and undesirable behaviour in // multi-stage pipelines LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); // Enabling this flag works around the JENA-820 issue job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new 
HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes should not have diverged Assert.assertEquals(1, nodes.size()); } finally { a.delete(); deleteDirectory(intermediateOutputDir); } }