List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)

Set the value of the name property.
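set(name, value) simply stores a string property in the job's configuration; a task typically reads it back in configure(JobConf). Below is a minimal sketch of that round trip before the examples from real projects; the property key my.example.message and the class name are illustrative, not taken from any of the source files that follow.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SetExampleMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

    private String message;

    @Override
    public void configure(JobConf job) {
        // Read back the value stored by the driver; the second argument
        // is the default returned when the property is unset.
        message = job.get("my.example.message", "no message");
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        output.collect(new Text(message), value);
    }
}

// In the driver (hypothetical class name):
// JobConf job = new JobConf(MyDriver.class);
// job.set("my.example.message", "hello from the driver");

Every example below follows the same pattern: the driver calls job.set(...) and a mapper, reducer, or output format reads the value back at runtime.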
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/**
 * Given a path to a valid key/value properties file on HDFS, all values will be
 * loaded into the provided JobConf to provide configuration properties for the
 * Hadoop Job.
 *
 * @param jobConf The JobConf to load property values into
 * @param path    The path to the properties file
 * @return The JobConf with loaded property values added
 */
public static JobConf loadJobConf(JobConf jobConf, String path) throws IOException {
    Properties propFile = loadPropertiesFile(path);
    // Loop through all properties in the properties file
    for (Object keyObject : propFile.keySet()) {
        String key = (String) keyObject;
        String value = propFile.getProperty(key);
        jobConf.set(key, value);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Loaded Configuration Property " + key + ": " + value);
        }
    }
    return jobConf;
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();
    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("m", "model", true, "location of the model");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        String model = line.getOptionValue("m");
        if (line.hasOption("help")) {
            formatter.printHelp("ClassifierJob", options);
            return 0;
        }
        if (model == null || input == null || output == null) {
            formatter.printHelp("ClassifierJob", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("ClassifierJob", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));
    String modelPath = line.getOptionValue("m");

    JobConf job = new JobConf(getConf());
    // push the model file to the DistributedCache
    DistributedCache.addCacheArchive(new URI(modelPath), job);

    job.setJarByClass(this.getClass());

    job.setJobName("ClassifierJob : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(TextClassifierMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.set(modelNameParam, modelPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.gate.GATEDriver.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 || args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;

    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (fs.exists(zap) == false) {
        System.err.println(
                "The GATE application " + zip_application_path + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.LucidWorksIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into LucidWorks");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(LucidWorksOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 2) {
        String syntax = "com.digitalpebble.solr.SOLRIndexerJob in solrURL";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    String solrURL = args[1];

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into SOLR");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SOLROutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(IdentityMapper.class);
    // no reducer : send straight to SOLR at end of mapping
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);

    job.set("solr.server.url", solrURL);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
    } finally {
        fs.delete(tmp, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.solr.TestSOLRWriter.java
License:Apache License
@Test
public void testFieldMappings() throws IOException {
    JobConf conf = new JobConf();
    conf.set("solr.server.url", "http://example.org");
    conf.set("solr.f.person", "Person.string");
    conf.set("solr.f.personTitle", "Person.title");
    conf.set("solr.f.location", "Location");

    Progressable progress = new Progressable() {
        @Override
        public void progress() {
        }
    };

    SOLRWriter writer = new SOLRWriter(progress);
    writer.open(conf, "test");

    assertEquals(writer.getFieldMapping().size(), 2);
    assertNotNull(writer.getFieldMapping().get("Person"));
    assertEquals(writer.getFieldMapping().get("Person").size(), 2);
    assertEquals(writer.getFieldMapping().get("Person").get("string"), "person");
    assertEquals(writer.getFieldMapping().get("Person").get("title"), "personTitle");
    assertNotNull(writer.getFieldMapping().get("Location"));
    assertEquals(writer.getFieldMapping().get("Location").size(), 1);
    assertEquals(writer.getFieldMapping().get("Location").get("*"), "location");
}
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());
    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)",
            true, false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());
        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && !handlerName.isEmpty()) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired)
            job.setReducerClass(BehemothReducer.class);
        else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            return -1;
            // don't delete the output as some of it could be used
            // fs.delete(outputPath, true);
        }
    } catch (OptionException e) {
        log.error("OptionException", e);
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.uima.UIMADriver.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the UIMA application has been stored on HDFS
    Path zap = new Path(pearPath);
    if (fs.exists(zap) == false) {
        System.err.println("The UIMA application " + pearPath + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);

    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        fs.delete(outputPath, true);
    }
    return 0;
}
From source file:com.ebay.erl.mobius.core.builder.TSVDataset.java
License:Apache License
@Override
public JobConf createJobConf(byte jobSequenceNumber) throws IOException {
    JobConf conf = super.createJobConf(jobSequenceNumber);
    if (!this.delimiter.equals("\t")) {
        conf.set(this.getID() + ".delimiter", SerializableUtil.serializeToBase64(delimiter));
    }
    return conf;
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Set the path to the SequenceFile storing the sorted partition keyset.
 * It must be the case that for <tt>R</tt> reduces, there are <tt>R-1</tt>
 * keys in the SequenceFile.
 */
public static void setPartitionFile(JobConf job, Path p) {
    job.set("total.order.partitioner.path", p.toString());
}