List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
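Before the project examples, a minimal sketch of the typical call pattern may help: setMapOutputKeyClass (together with setMapOutputValueClass) declares the intermediate key/value types emitted by the mapper whenever they differ from the final output types configured via setOutputKeyClass/setOutputValueClass. The driver below is illustrative only; WordTokenizerMapper and WordSumReducer are hypothetical placeholder classes, not taken from any project cited on this page.

// Minimal configuration sketch (hypothetical classes, not from the examples below).
// WordTokenizerMapper is assumed to emit <Text, IntWritable> from map();
// WordSumReducer is assumed to write <Text, LongWritable> from reduce().
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCountDriver.class);
        conf.setJobName("word-count");

        conf.setMapperClass(WordTokenizerMapper.class); // hypothetical: emits <Text, IntWritable>
        conf.setReducerClass(WordSumReducer.class);     // hypothetical: writes <Text, LongWritable>

        // The map output value type (IntWritable) differs from the job's final
        // output value type (LongWritable), so the intermediate types must be
        // declared explicitly; otherwise the framework assumes the values set
        // via setOutputKeyClass/setOutputValueClass.
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}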
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link HadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopJob job = new HadoopV2Job(new HadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link HadoopJobEx} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopDefaultJobInfo info = createJobInfo(cfg);

    final UUID uuid = UUID.randomUUID();

    HadoopJobId id = new HadoopJobId(uuid, 1);

    HadoopJobEx job = info.createJob(HadoopV2Job.class, id, log, null, new HadoopHelperImpl());

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java
License:Apache License
public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationJob.class);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));
    conf.setInt(OUT_CARD, outCardinality);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplicationReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    return conf;
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);

    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Runs the Extractor job. Get outlinks to be converted while ignoring empty
 * and null anchors.
 *
 * @param webGraphDb The WebGraphDb to pull from.
 * @param output The extractor output.
 *
 * @throws IOException If an error occurs while running the extractor.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {
    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
        JobClient.runJob(extractor);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished extractor job.");
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Runs the collector job. Aggregates extracted inlinks, sorts and converts
 * the highest scoring into FieldWritable objects. Only inlinks for which
 * basic fields exist will be collected to avoid orphan fields.
 *
 * @param basicFields The BasicFields which must be present to collect anchors
 * to avoid orphan fields.
 * @param links The outlinks path.
 * @param output The collector output.
 *
 * @throws IOException If an error occurs while running the collector.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {
    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Extractor job. Extracts basic fields from segments.
 *
 * @param nodeDb The node database
 * @param segment A single segment to process.
 * @param outputDir The extractor output.
 *
 * @throws IOException If an error occurs while processing the segment.
 */
private void runExtractor(Path nodeDb, Path segment, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);

    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished extractor");
    }
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Flipper job. Flipper is the first of a two part job to implement
 * redirect logic.
 *
 * @param basicFields The basic fields temporary output.
 * @param nodeDb The node database.
 * @param outputDir The flipper output.
 *
 * @throws IOException If an error occurs while processing.
 */
private void runFlipper(Path basicFields, Path nodeDb, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);
    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished flipper");
    }
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Scorer job. Scorer is the second of a two part job to implement
 * redirect logic.
 *
 * @param basicFields The basic fields temporary output.
 * @param links The temporary output holding urls and any redirects.
 * @param outputDir The scorer output.
 *
 * @throws IOException If an error occurs while processing.
 */
private void runScorer(Path basicFields, Path links, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);
    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished scorer");
    }
}