List of usage examples for org.apache.hadoop.mapred.JobConf.setInt

public void setInt(String name, int value)

Sets the value of the name property to an int.
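Before the project examples below, here is a minimal, self-contained sketch of the basic pattern; the class name, property name, and values are invented for illustration and are not taken from any of the projects listed here. setInt stores an int in the job configuration, and getInt reads it back, falling back to a default when the property was never set.

import org.apache.hadoop.mapred.JobConf;

public class SetIntSketch {
    public static void main(String[] args) {
        // Hypothetical property name and values, for illustration only.
        JobConf conf = new JobConf(SetIntSketch.class);
        conf.setInt("example.max.records", 1000);                 // store an int in the job configuration
        int maxRecords = conf.getInt("example.max.records", 100); // read it back; 100 is the default if absent
        System.out.println("example.max.records = " + maxRecords);
    }
}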
From source file: org.terrier.utility.io.HadoopUtility.java
License: Mozilla Public License

protected static Path makeTemporaryFile(JobConf jobConf, String filename) throws IOException {
    final int randomKey = jobConf.getInt("terrier.tempfile.id", random.nextInt());
    jobConf.setInt("terrier.tempfile.id", randomKey);
    FileSystem defFS = FileSystem.get(jobConf);
    final Path tempFile = new Path(HADOOP_TMP_PATH + "/" + (randomKey) + "-" + filename);
    defFS.deleteOnExit(tempFile);
    return tempFile;
}
From source file: org.warcbase.index.IndexerRunner.java
License: Apache License

@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);
    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }
    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);
    return 0;
}
From source file: org.weikey.terasort.TeraSort.java
License: Apache License

@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    SortConfig sortConfig = new SortConfig(job);
    // if (args.length >= 3) {
    //     job.setNumReduceTasks(Integer.valueOf(args[2]));
    //     if (args.length >= 4) {
    //         sortConfig.setStartKey(Integer.valueOf(args[3]));
    //         if (args.length >= 5) {
    //             sortConfig.setFieldSeparator(args[4]);
    //         }
    //     }
    // }
    Integer numMapTasks = null;
    Integer numReduceTasks = null;
    List<String> otherArgs = new ArrayList<String>();
    boolean createLzopIndex = false;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                job.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
                sortConfig.setIgnoreCase(true);
            } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
                sortConfig.setUnique(true);
            } else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
                String[] parts = StringUtils.split(args[++i], ",");
                sortConfig.setStartKey(Integer.valueOf(parts[0]));
                if (parts.length > 1) {
                    sortConfig.setEndKey(Integer.valueOf(parts[1]));
                }
            } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
                sortConfig.setFieldSeparator(args[++i]);
            } else if ("--total-order".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
            } else if ("--lzop-index".equals(args[i])) {
                createLzopIndex = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
From source file: pathmerge.linear.MergePathH1Driver.java
License: Apache License

public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
        int mergeRound, String defaultConfPath) throws IOException {
    JobConf conf = new JobConf(MergePathH1Driver.class);
    conf.setInt("sizeKmer", sizeKmer);
    if (defaultConfPath != null) {
        conf.addResource(new Path(defaultConfPath));
    }
    conf.setJobName("Initial Path-Starting-Points Table");
    conf.setMapperClass(SNodeInitialMapper.class);
    conf.setReducerClass(SNodeInitialReducer.class);
    conf.setMapOutputKeyClass(Kmer.class);
    conf.setMapOutputValueClass(MergePathValueWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    String singlePointPath = "comSinglePath0";
    MultipleOutputs.addNamedOutput(conf, singlePointPath, MergePathMultiSeqOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);
    conf.setOutputKeyClass(VKmerBytesWritable.class);
    conf.setOutputValueClass(MergePathValueWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(inputPath + "stepNext"));
    conf.setNumReduceTasks(numReducers);
    FileSystem dfs = FileSystem.get(conf);
    dfs.delete(new Path(inputPath + "stepNext"), true);
    JobClient.runJob(conf);
    dfs.rename(new Path(inputPath + "stepNext" + "/" + singlePointPath),
            new Path(mergeResultPath + "/" + singlePointPath));

    int iMerge = 0;
    /*----------------------------------------------------------------------*/
    for (iMerge = 1; iMerge <= mergeRound; iMerge++) {
        // if (!dfs.exists(new Path(inputPath + "-step1")))
        //     break;
        conf = new JobConf(MergePathH1Driver.class);
        conf.setInt("sizeKmer", sizeKmer);
        conf.setInt("iMerge", iMerge);
        if (defaultConfPath != null) {
            conf.addResource(new Path(defaultConfPath));
        }
        conf.setJobName("Path Merge");
        conf.setMapperClass(MergePathH1Mapper.class);
        conf.setReducerClass(MergePathH1Reducer.class);
        conf.setMapOutputKeyClass(VKmerBytesWritable.class);
        conf.setMapOutputValueClass(MergePathValueWritable.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        String uncompSinglePath = "uncompSinglePath" + iMerge;
        String comSinglePath = "comSinglePath" + iMerge;
        String comCircle = "comCircle" + iMerge;
        MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        conf.setOutputKeyClass(VKmerBytesWritable.class);
        conf.setOutputValueClass(MergePathValueWritable.class);
        FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        conf.setNumReduceTasks(numReducers);
        dfs.delete(new Path(outputPath), true);
        JobClient.runJob(conf);
        dfs.delete(new Path(inputPath + "stepNext"), true);
        dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
        dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
        dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
    }
}
From source file: pathmerge.log.MergePathH2Driver.java
License: Apache License

public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
        int mergeRound, String defaultConfPath) throws IOException {
    JobConf conf = new JobConf(MergePathH2Driver.class);
    conf.setInt("sizeKmer", sizeKmer);
    if (defaultConfPath != null) {
        conf.addResource(new Path(defaultConfPath));
    }
    conf.setJobName("Initial Path-Starting-Points Table");
    conf.setMapperClass(SNodeInitialMapper.class);
    conf.setReducerClass(SNodeInitialReducer.class);
    conf.setMapOutputKeyClass(Kmer.class);
    conf.setMapOutputValueClass(MergePathValueWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    String singlePointPath = "comSinglePath0";
    MultipleOutputs.addNamedOutput(conf, singlePointPath, MergePathMultiSeqOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);
    conf.setOutputKeyClass(VKmerBytesWritable.class);
    conf.setOutputValueClass(MergePathValueWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(inputPath + "stepNext"));
    conf.setNumReduceTasks(numReducers);
    FileSystem dfs = FileSystem.get(conf);
    dfs.delete(new Path(inputPath + "stepNext"), true);
    JobClient.runJob(conf);
    dfs.rename(new Path(inputPath + "stepNext" + "/" + singlePointPath),
            new Path(mergeResultPath + "/" + singlePointPath));

    int iMerge = 0;
    for (iMerge = 1; iMerge <= mergeRound; iMerge++) {
        // if (!dfs.exists(new Path(inputPath + "-step1")))
        //     break;
        conf = new JobConf(MergePathH2Driver.class);
        conf.setInt("sizeKmer", sizeKmer);
        conf.setInt("iMerge", iMerge);
        if (defaultConfPath != null) {
            conf.addResource(new Path(defaultConfPath));
        }
        conf.setJobName("Path Merge");
        conf.setMapperClass(MergePathH2Mapper.class);
        conf.setReducerClass(MergePathH2Reducer.class);
        conf.setMapOutputKeyClass(VKmerBytesWritable.class);
        conf.setMapOutputValueClass(MergePathValueWritable.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        String uncompSinglePath = "uncompSinglePath" + iMerge;
        String comSinglePath = "comSinglePath" + iMerge;
        String comCircle = "comCircle" + iMerge;
        MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);
        conf.setOutputKeyClass(VKmerBytesWritable.class);
        conf.setOutputValueClass(MergePathValueWritable.class);
        FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        conf.setNumReduceTasks(numReducers);
        dfs.delete(new Path(outputPath), true);
        JobClient.runJob(conf);
        dfs.delete(new Path(inputPath + "stepNext"), true);
        dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
        dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
        dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
    }
    /*
    conf = new JobConf(MergePathH2Driver.class);
    conf.setInt("sizeKmer", sizeKmer);
    conf.setInt("iMerge", iMerge);
    if (defaultConfPath != null) {
        conf.addResource(new Path(defaultConfPath));
    }
    conf.setJobName("Path Merge");
    conf.setMapperClass(MergePathH2Mapper.class);
    conf.setReducerClass(MergePathH2Reducer.class);
    conf.setMapOutputKeyClass(VKmerBytesWritable.class);
    conf.setMapOutputValueClass(MergePathValueWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    String uncompSinglePath = "uncompSinglePath" + iMerge;
    String comSinglePath = "comSinglePath" + iMerge;
    String comCircle = "comCircle" + iMerge;
    MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiTextOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);
    MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiTextOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);
    MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiTextOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);
    conf.setOutputKeyClass(VKmerBytesWritable.class);
    conf.setOutputValueClass(MergePathValueWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    conf.setNumReduceTasks(numReducers);
    dfs.delete(new Path(outputPath), true);
    JobClient.runJob(conf);
    dfs.delete(new Path(inputPath + "stepNext"), true);
    dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
    dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
    dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
    */
}
From source file: redpoll.examples.sogou.SogouTermDriver.java
License: Apache License

public static void runJob(String input, String output, String analyzerName, int dfLimit) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(SogouTermDriver.class);
    FileSystem fs = FileSystem.get(conf);
    Path outPath = new Path(output);
    if (fs.exists(outPath)) {
        fs.delete(outPath, true);
    }
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TermWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.set("redpoll.text.analyzer", analyzerName);
    conf.setInt("redpoll.text.df.limit", dfLimit);
    conf.setMapperClass(TermMapper.class);
    conf.setReducerClass(TermReducer.class);
    conf.setInputFormat(SogouInputFormat.class);
    conf.setOutputFormat(TermOutputFormat.class);
    client.setConf(conf);
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        LOG.error(e.toString());
    }
}
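The drivers on this page all rely on the same hand-off: the driver calls setInt, and the map or reduce tasks retrieve the value at runtime with getInt, typically inside configure(JobConf). A minimal sketch of that consuming side using the old mapred API; the mapper class, property name, and default value are hypothetical and not taken from any of the projects shown here.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper: reads back an int parameter that the driver stored with conf.setInt(...).
public class ThresholdMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    private int limit;

    @Override
    public void configure(JobConf job) {
        // Read the value back; 50 is the fallback if the driver never called setInt.
        limit = job.getInt("example.df.limit", 50);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // Use the configured limit; here we simply pass through lines shorter than the limit.
        if (value.getLength() < limit) {
            output.collect(new Text("short"), value);
        }
    }
}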
From source file: redpoll.text.TfIdfDriver.java
License: Apache License

/**
 * Run the job.
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(TfIdfDriver.class);
    FileSystem fs = FileSystem.get(conf);

    Path inPath = new Path(input + "/tf");
    FileInputFormat.setInputPaths(conf, inPath);
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(TfIdfMapper.class);
    conf.setReducerClass(TfIdfReducer.class);
    //conf.setNumMapTasks(10);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TfIdfWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TfIdfOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    // Serialize a term hashmap. Its key is the term; its value is the term's index in
    // the term vector.
    Path dfpath = new Path(input + "/df/part-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dfpath, conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    HashMap<String, Integer> termMap = new HashMap<String, Integer>();
    int index = 0;
    while ((reader.next(key, value))) {
        String termString = key.toString();
        if (!termString.equals("redpoll.docs.num")) {
            termMap.put(key.toString(), index);
            index++;
        } else {
            conf.setInt("redpoll.docs.num", value.get());
        }
    }
    reader.close();
    DefaultStringifier<HashMap<String, Integer>> mapStringifier = new DefaultStringifier<HashMap<String, Integer>>(
            conf, GenericsUtil.getClass(termMap));
    String termMapString = mapStringifier.toString(termMap);
    conf.setInt("redpoll.text.terms.num", index); // number of terms
    conf.set("redpoll.text.terms", termMapString);
    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file: source.TeraSort.java
License: Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);

    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    WARCMDXGenerator wir = new WARCMDXGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Default is 1 reducer (as knitting together multiple sequence files
    // is not a mere matter of concatenation):
    Assert.assertEquals(1, outputFiles.length);

    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputSeq);
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();

    // Check contents of the output:
    Configuration config = new Configuration();
    Path path = new Path(outputSeq.getAbsolutePath());
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
    Writable value = (Writable) reader.getValueClass().newInstance();
    MDX mdx;
    int counter = 0;
    while (reader.next(key, value)) {
        mdx = new MDX(value.toString());
        System.out.println(
                "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
        counter++;
    }
    assertEquals(114, counter);
    reader.close();

    // Now test the MDXSeqMerger
    testSeqMerger(outputFiles);
}
From source file: voldemort.store.readonly.benchmark.GenerateData.java
License: Apache License

public int run(String[] args) throws Exception {
    if (args.length != 3)
        Utils.croak("USAGE: GenerateData input-file output-dir value-size");
    JobConf conf = new JobConf(getConf(), GenerateData.class);
    conf.setJobName("generate-data");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(GenerateDataMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setNumReduceTasks(0);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conf, inputPath);
    Path outputPath = new Path(args[1]);
    // delete output path if it already exists
    FileSystem fs = outputPath.getFileSystem(conf);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInt("value.size", Integer.parseInt(args[2]));

    JobClient.runJob(conf);
    return 0;
}