List of usage examples for org.apache.hadoop.fs Path Path
public Path(URI aUri)
From source file:ReadAllTest.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.out.println("ReadAllTest: must supply the HDFS uri and file to read"); System.exit(1);/*from w w w . j a va 2s .c o m*/ } String hdfsUri = args[0]; String fileName = args[1]; final Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(new URI(hdfsUri), conf); byte ORIGINAL[] = new byte[10]; for (int i = 0; i < ORIGINAL.length; i++) { ORIGINAL[i] = (byte) i; } FSDataOutputStream out = fs.create(new Path(fileName), (short) 1); try { out.write(ORIGINAL); } finally { out.close(); } byte input[] = new byte[ORIGINAL.length]; FSDataInputStream in = fs.open(new Path(fileName)); try { in.readFully(input); } finally { in.close(); } in = fs.open(new Path(fileName)); try { in.readFully(0, input); } finally { in.close(); } }
From source file:MRDriver.java
License:Apache License
public int run(String args[]) throws Exception { FileSystem fs = null;// www . j a v a2 s . c om Path samplesMapPath = null; float epsilon = Float.parseFloat(args[0]); double delta = Double.parseDouble(args[1]); int minFreqPercent = Integer.parseInt(args[2]); int d = Integer.parseInt(args[3]); int datasetSize = Integer.parseInt(args[4]); int numSamples = Integer.parseInt(args[5]); double phi = Double.parseDouble(args[6]); Random rand; /************************ Job 1 (local FIM) Configuration ************************/ JobConf conf = new JobConf(getConf()); /* * Compute the number of required "votes" for an itemsets to be * declared frequent */ // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2. int reqApproxNum = (int) Math .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1; int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi))); //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum); conf.setInt("PARMM.reducersNum", numSamples); conf.setInt("PARMM.datasetSize", datasetSize); conf.setInt("PARMM.minFreqPercent", minFreqPercent); conf.setInt("PARMM.sampleSize", sampleSize); conf.setFloat("PARMM.epsilon", epsilon); // Set the number of reducers equal to the number of samples, to // maximize parallelism. Required by our Partitioner. conf.setNumReduceTasks(numSamples); // XXX: why do we disable the speculative execution? MR conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI); /* * Enable compression of map output. * * We do it for this job and not for the aggregation one because * each mapper there only print out one record for each itemset, * so there isn't much to compress, I'd say. MR * * In Amazon MapReduce compression of the map output seems to be * happen by default and the Snappy codec is used, which is * extremely fast. */ conf.setBoolean("mapred.compress.map.output", true); //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class); conf.setJarByClass(MRDriver.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); // We write the collections found in a reducers as a SequenceFile conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9])); // set the mapper class based on command line option switch (Integer.parseInt(args[7])) { case 1: System.out.println("running partition mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(PartitionMapper.class); break; case 2: System.out.println("running binomial mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(BinomialSamplerMapper.class); break; case 3: System.out.println("running coin mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(CoinFlipSamplerMapper.class); case 4: System.out.println("running sampler mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(InputSamplerMapper.class); // create a random sample of size T*m rand = new Random(); long sampling_start_time = System.nanoTime(); int[] samples = new int[numSamples * sampleSize]; for (int i = 0; i < numSamples * sampleSize; i++) { samples[i] = rand.nextInt(datasetSize); } // for each key in the sample, create a list of all T samples to which this key belongs Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>(); for (int i = 0; i < numSamples * sampleSize; i++) { ArrayList<IntWritable> sampleIDs = null; LongWritable key = new LongWritable(samples[i]); if (hashTable.containsKey(key)) sampleIDs = hashTable.get(key); else sampleIDs = new ArrayList<IntWritable>(); sampleIDs.add(new IntWritable(i % numSamples)); hashTable.put(key, sampleIDs); } /* * Convert the Hastable to a MapWritable which we will * write to HDFS and distribute to all Mappers using * DistributedCache */ MapWritable map = new MapWritable(); for (LongWritable key : hashTable.keySet()) { ArrayList<IntWritable> sampleIDs = hashTable.get(key); IntArrayWritable sampleIDsIAW = new IntArrayWritable(); sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()])); map.put(key, sampleIDsIAW); } fs = FileSystem.get(URI.create("samplesMap.ser"), conf); samplesMapPath = new Path("samplesMap.ser"); FSDataOutputStream out = fs.create(samplesMapPath, true); map.write(out); out.sync(); out.close(); DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"), conf); // stop the sampling timer long sampling_end_time = System.nanoTime(); long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000; System.out.println("sampling runtime (milliseconds): " + sampling_runtime); break; // end switch case case 5: System.out.println("running random integer partition mapper..."); conf.setInputFormat(WholeSplitInputFormat.class); Path inputFilePath = new Path(args[8]); WholeSplitInputFormat.addInputPath(conf, inputFilePath); conf.setMapperClass(RandIntPartSamplerMapper.class); // Compute number of map tasks. fs = inputFilePath.getFileSystem(conf); FileStatus inputFileStatus = fs.getFileStatus(inputFilePath); long len = inputFileStatus.getLen(); long blockSize = inputFileStatus.getBlockSize(); conf.setLong("mapred.min.split.size", blockSize); conf.setLong("mapred.max.split.size", blockSize); int mapTasksNum = ((int) (len / blockSize)) + 1; conf.setNumMapTasks(mapTasksNum); //System.out.println("len: " + len + " blockSize: " // + blockSize + " mapTasksNum: " + mapTasksNum); // Extract random integer partition of total sample // size into up to mapTasksNum partitions. // XXX I'm not sure this is a correct way to do // it. rand = new Random(); IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples]; for (int j = 0; j < numSamples; j++) { IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum]; int sum = 0; int i; for (i = 0; i < mapTasksNum - 1; i++) { int size = rand.nextInt(sampleSize - sum); tempToSampleArr[i] = new IntWritable(size); sum += size; if (sum > numSamples * sampleSize) { System.out.println("Something went wrong generating the sample Sizes"); System.exit(1); } if (sum == sampleSize) { break; } } if (i == mapTasksNum - 1) { tempToSampleArr[i] = new IntWritable(sampleSize - sum); } else { for (; i < mapTasksNum; i++) { tempToSampleArr[i] = new IntWritable(0); } } Collections.shuffle(Arrays.asList(tempToSampleArr)); for (i = 0; i < mapTasksNum; i++) { toSampleArr[i][j] = tempToSampleArr[i]; } } for (int i = 0; i < mapTasksNum; i++) { DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i); } break; default: System.err.println("Wrong Mapper ID. Can only be in [1,5]"); System.exit(1); break; } /* * We don't use the default hash partitioner because we want to * maximize the parallelism. That's why we also fix the number * of reducers. */ conf.setPartitionerClass(FIMPartitioner.class); conf.setReducerClass(FIMReducer.class); /************************ Job 2 (aggregation) Configuration ************************/ JobConf confAggr = new JobConf(getConf()); confAggr.setInt("PARMM.reducersNum", numSamples); confAggr.setInt("PARMM.reqApproxNum", reqApproxNum); confAggr.setInt("PARMM.sampleSize", sampleSize); confAggr.setFloat("PARMM.epsilon", epsilon); // XXX: Why do we disable speculative execution? MR confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false); confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI); confAggr.setJarByClass(MRDriver.class); confAggr.setMapOutputKeyClass(Text.class); confAggr.setMapOutputValueClass(DoubleWritable.class); confAggr.setOutputKeyClass(Text.class); confAggr.setOutputValueClass(Text.class); confAggr.setMapperClass(AggregateMapper.class); confAggr.setReducerClass(AggregateReducer.class); confAggr.setInputFormat(CombineSequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9])); FileOutputFormat.setOutputPath(confAggr, new Path(args[10])); long FIMjob_start_time = System.currentTimeMillis(); RunningJob FIMjob = JobClient.runJob(conf); long FIMjob_end_time = System.currentTimeMillis(); RunningJob aggregateJob = JobClient.runJob(confAggr); long aggrJob_end_time = System.currentTimeMillis(); long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time; long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time; if (args[7].equals("4")) { // Remove samplesMap file fs.delete(samplesMapPath, false); } Counters counters = FIMjob.getCounters(); Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart"); long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()]; int i = 0; for (Counters.Counter counter : FIMMapperStartTimesCounters) { FIMMapperStartTimes[i++] = counter.getCounter(); } Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd"); long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMMapperEndTimesCounters) { FIMMapperEndTimes[i++] = counter.getCounter(); } Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart"); long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMReducerStartTimesCounters) { FIMReducerStartTimes[i++] = counter.getCounter(); } Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd"); long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMReducerEndTimesCounters) { FIMReducerEndTimes[i++] = counter.getCounter(); } Counters countersAggr = aggregateJob.getCounters(); Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart"); long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateMapperStartTimesCounters) { AggregateMapperStartTimes[i++] = counter.getCounter(); } Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd"); long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateMapperEndTimesCounters) { AggregateMapperEndTimes[i++] = counter.getCounter(); } Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart"); long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateReducerStartTimesCounters) { AggregateReducerStartTimes[i++] = counter.getCounter(); } Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd"); long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateReducerEndTimesCounters) { AggregateReducerEndTimes[i++] = counter.getCounter(); } long FIMMapperStartMin = FIMMapperStartTimes[0]; for (long l : FIMMapperStartTimes) { if (l < FIMMapperStartMin) { FIMMapperStartMin = l; } } long FIMMapperEndMax = FIMMapperEndTimes[0]; for (long l : FIMMapperEndTimes) { if (l > FIMMapperEndMax) { FIMMapperEndMax = l; } } System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time)); System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin)); long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length]; long FIMMapperRunTimesSum = 0; for (int l = 0; l < FIMMapperStartTimes.length; l++) { FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l]; FIMMapperRunTimesSum += FIMMapperRunTimes[l]; } System.out.println("FIMMapper average task runtime (milliseconds): " + FIMMapperRunTimesSum / FIMMapperStartTimes.length); long FIMMapperRunTimesMin = FIMMapperRunTimes[0]; long FIMMapperRunTimesMax = FIMMapperRunTimes[0]; for (long l : FIMMapperRunTimes) { if (l < FIMMapperRunTimesMin) { FIMMapperRunTimesMin = l; } if (l > FIMMapperRunTimesMax) { FIMMapperRunTimesMax = l; } } System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin); System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax); long FIMReducerStartMin = FIMReducerStartTimes[0]; for (long l : FIMReducerStartTimes) { if (l < FIMReducerStartMin) { FIMReducerStartMin = l; } } long FIMReducerEndMax = FIMReducerEndTimes[0]; for (long l : FIMReducerEndTimes) { if (l > FIMReducerEndMax) { FIMReducerEndMax = l; } } System.out .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax)); System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin)); long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length]; long FIMReducerRunTimesSum = 0; for (int l = 0; l < FIMReducerStartTimes.length; l++) { FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l]; FIMReducerRunTimesSum += FIMReducerRunTimes[l]; } System.out.println("FIMReducer average task runtime (milliseconds): " + FIMReducerRunTimesSum / FIMReducerStartTimes.length); long FIMReducerRunTimesMin = FIMReducerRunTimes[0]; long FIMReducerRunTimesMax = FIMReducerRunTimes[0]; for (long l : FIMReducerRunTimes) { if (l < FIMReducerRunTimesMin) { FIMReducerRunTimesMin = l; } if (l > FIMReducerRunTimesMax) { FIMReducerRunTimesMax = l; } } System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin); System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax); System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax)); long AggregateMapperStartMin = AggregateMapperStartTimes[0]; for (long l : AggregateMapperStartTimes) { if (l < AggregateMapperStartMin) { AggregateMapperStartMin = l; } } long AggregateMapperEndMax = AggregateMapperEndTimes[0]; for (long l : AggregateMapperEndTimes) { if (l > AggregateMapperEndMax) { AggregateMapperEndMax = l; } } System.out.println( "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time)); System.out.println("AggregateMapper total runtime (milliseconds): " + (AggregateMapperEndMax - AggregateMapperStartMin)); long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length]; long AggregateMapperRunTimesSum = 0; for (int l = 0; l < AggregateMapperStartTimes.length; l++) { AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l]; AggregateMapperRunTimesSum += AggregateMapperRunTimes[l]; } System.out.println("AggregateMapper average task runtime (milliseconds): " + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length); long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0]; long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0]; for (long l : AggregateMapperRunTimes) { if (l < AggregateMapperRunTimesMin) { AggregateMapperRunTimesMin = l; } if (l > AggregateMapperRunTimesMax) { AggregateMapperRunTimesMax = l; } } System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin); System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax); long AggregateReducerStartMin = AggregateReducerStartTimes[0]; for (long l : AggregateReducerStartTimes) { if (l < AggregateReducerStartMin) { AggregateReducerStartMin = l; } } long AggregateReducerEndMax = AggregateReducerEndTimes[0]; for (long l : AggregateReducerEndTimes) { if (l > AggregateReducerEndMax) { AggregateReducerEndMax = l; } } System.out.println("Aggregate job round shuffle phase runtime (milliseconds): " + (AggregateReducerStartMin - AggregateMapperEndMax)); System.out.println("AggregateReducer total runtime (milliseconds): " + (AggregateReducerEndMax - AggregateReducerStartMin)); long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length]; long AggregateReducerRunTimesSum = 0; for (int l = 0; l < AggregateReducerStartTimes.length; l++) { AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l]; AggregateReducerRunTimesSum += AggregateReducerRunTimes[l]; } System.out.println("AggregateReducer average task runtime (milliseconds): " + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length); long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0]; long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0]; for (long l : AggregateReducerRunTimes) { if (l < AggregateReducerRunTimesMin) { AggregateReducerRunTimesMin = l; } if (l > AggregateReducerRunTimesMax) { AggregateReducerRunTimesMax = l; } } System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin); System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax); System.out.println( "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax)); System.out .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time)); System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): " + (AggregateReducerEndMax - FIMMapperStartMin)); System.out.println("total runtime (no setups, no cooldowns) (milliseconds): " + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin)); System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime); System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): " + (FIMReducerEndMax - FIMMapperStartMin)); System.out.println( "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime); System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): " + (AggregateReducerEndMax - AggregateMapperStartMin)); return 0; }
From source file:FindMaxPageRankNodes.java
License:Apache License
/** * Runs this tool./* ww w . j a v a2 s . c om*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("top n").create(TOP)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(TOP)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int n = Integer.parseInt(cmdline.getOptionValue(TOP)); LOG.info("Tool name: " + FindMaxPageRankNodes.class.getSimpleName()); LOG.info(" - input: " + inputPath); LOG.info(" - output: " + outputPath); LOG.info(" - top: " + n); Configuration conf = getConf(); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInt("n", n); Job job = Job.getInstance(conf); job.setJobName(FindMaxPageRankNodes.class.getName() + ":" + inputPath); job.setJarByClass(FindMaxPageRankNodes.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(FloatWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(FloatWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:WriteFDFPerformance.java
License:Open Source License
static void writetxt() throws IOException { FSDataOutputStream fos = FileSystem.get(new Configuration()).create(new Path("txt/txt")); fos.writeBytes(String.valueOf(127) + "," + String.valueOf(1000) + "\r\n"); fos.writeBytes(String.valueOf(127) + "," + String.valueOf(1000) + "\r\n"); fos.writeBytes(String.valueOf(127) + "," + String.valueOf(1000) + "\r\n"); fos.close();/* w w w.ja v a2 s .c o m*/ }
From source file:Text2ColumntStorageMR.java
License:Open Source License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { if (args.length != 3) { System.out.println("Text2ColumnStorageMR <input> <output> <columnStorageMode>"); System.exit(-1);/* w w w . j a va 2 s.c o m*/ } JobConf conf = new JobConf(Text2ColumntStorageMR.class); conf.setJobName("Text2ColumnStorageMR"); conf.setNumMapTasks(1); conf.setNumReduceTasks(4); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Unit.Record.class); conf.setMapperClass(TextFileMapper.class); conf.setReducerClass(ColumnStorageReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat((Class<? extends OutputFormat>) ColumnStorageHiveOutputFormat.class); conf.set("mapred.output.compress", "flase"); Head head = new Head(); initHead(head); head.toJobConf(conf); int bt = Integer.valueOf(args[2]); FileInputFormat.setInputPaths(conf, args[0]); Path outputPath = new Path(args[1]); FileOutputFormat.setOutputPath(conf, outputPath); FileSystem fs = outputPath.getFileSystem(conf); fs.delete(outputPath, true); JobClient jc = new JobClient(conf); RunningJob rj = null; rj = jc.submitJob(conf); String lastReport = ""; SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss,SSS"); long reportTime = System.currentTimeMillis(); long maxReportInterval = 3 * 1000; while (!rj.isComplete()) { try { Thread.sleep(1000); } catch (InterruptedException e) { } int mapProgress = Math.round(rj.mapProgress() * 100); int reduceProgress = Math.round(rj.reduceProgress() * 100); String report = " map = " + mapProgress + "%, reduce = " + reduceProgress + "%"; if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) { String output = dateFormat.format(Calendar.getInstance().getTime()) + report; System.out.println(output); lastReport = report; reportTime = System.currentTimeMillis(); } } System.exit(0); }
From source file:LookupPostings.java
License:Apache License
/** * Runs this tool./*from w w w . jav a 2 s. c o m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(COLLECTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(LookupPostings.class.getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX); String collectionPath = cmdline.getOptionValue(COLLECTION); if (collectionPath.endsWith(".gz")) { System.out.println("gzipped collection is not seekable: use compressed version!"); System.exit(-1); } Configuration config = new Configuration(); FileSystem fs = FileSystem.get(config); MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config); FSDataInputStream collection = fs.open(new Path(collectionPath)); BufferedReader d = new BufferedReader(new InputStreamReader(collection)); Text key = new Text(); PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value = new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>(); System.out.println("Looking up postings for the term \"starcross'd\""); key.set("starcross'd"); reader.get(key, value); ArrayListWritable<PairOfInts> postings = value.getRightElement(); for (PairOfInts pair : postings) { System.out.println(pair); collection.seek(pair.getLeftElement()); System.out.println(d.readLine()); } key.set("gold"); reader.get(key, value); System.out.println("Complete postings list for 'gold': " + value); Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry(); postings = value.getRightElement(); for (PairOfInts pair : postings) { goldHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for gold"); for (PairOfInts pair : goldHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } key.set("silver"); reader.get(key, value); System.out.println("Complete postings list for 'silver': " + value); Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry(); postings = value.getRightElement(); for (PairOfInts pair : postings) { silverHist.increment(pair.getRightElement()); } System.out.println("histogram of tf values for silver"); for (PairOfInts pair : silverHist) { System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement()); } key.set("bronze"); Writable w = reader.get(key, value); if (w == null) { System.out.println("the term bronze does not appear in the collection"); } collection.close(); reader.close(); return 0; }
From source file:Importer.java
License:Open Source License
public static void copyFile(File file) throws Exception { // String TEST_PREFIX = ""; File destFile = new File(outDir, file.getName() + ".seq"); Path dest = new Path(destFile.getAbsolutePath()); Configuration conf = new Configuration(); FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")), conf);/* w w w. j a v a 2 s .c o m*/ CompressionCodec codec = new DefaultCodec(); fileSys.mkdirs(dest.getParent()); FSDataOutputStream outputStr = fileSys.create(dest); seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class, SequenceFile.CompressionType.BLOCK, codec); String filename = file.getName(); InputStream in = new BufferedInputStream(new FileInputStream(file)); if (filename.endsWith(".bz2")) { in.read(); in.read(); //snarf header in = new CBZip2InputStream(in); } BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII")); System.out.println("working on file " + file); int records = 0; long bytes = 0, bytes_since_status = 0; long startTime = System.currentTimeMillis(); String s = null; Text content = new Text(); while ((s = br.readLine()) != null) { if (s.startsWith("---END.OF.DOCUMENT---")) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; content = new Text(); } else { byte[] line_as_bytes = (s + " ").getBytes(); for (byte b : line_as_bytes) { assert b < 128 : "found an unexpected high-bit set"; } content.append(line_as_bytes, 0, line_as_bytes.length); bytes += line_as_bytes.length; /* bytes_since_status += line_as_bytes.length; if(bytes_since_status > 10 * 1024 * 1024) { //every 10 MB System.err.print('.'); bytes_since_status = 0; }*/ } } //end while if (content.getLength() > 5) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; } totalBytes += bytes; totalRecords += records; long time = (System.currentTimeMillis() - startTime) / 1000 + 1; long kbSec = bytes / 1024 / time; System.out.println(new java.util.Date()); System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time + " seconds (" + kbSec + " KB/sec)."); in.close(); seqFileWriter.close(); outputStr.close(); }
From source file:Vectors.java
License:Apache License
public static void main(String args[]) throws IOException { String pathString = "preference/part-r-00000"; Path examplar = new Path(pathString); Configuration conf = new Configuration(); // Vector vt=new RandomAccessSparseVector(3); // vt.set(0, 2.0); // VectorWritable vectorWritable=new VectorWritable(vt); // vectorWritable.setWritesLaxPrecision(false); // FileSystem fs=examplar.getFileSystem(conf); // FSDataOutputStream out=fs.append(examplar); // vectorWritable.write(out); for (int i = 0; i < 1; i++) { Vector vector = read(examplar, conf); System.out.println(vector); }/*from ww w .ja v a 2 s . c o m*/ }
From source file:GetUserInfoGivenMovieId.java
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); conf.set("movieId", args[2]); Job job = new Job(conf, "GetUserInfoGivenMovieId"); job.setInputFormatClass(TextInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setJarByClass(GetUserInfoGivenMovieId.class); job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[3])); boolean flag1 = job.waitForCompletion(true); if (flag1) {/* w w w.j a v a 2 s . co m*/ Configuration conf2 = new Configuration(); //FileSystem fs = FileSystem.get(conf2); //Path Intermediate = new Path(args[1]); //DistributedCache.addCacheFile(Intermediate.toUri(), conf2); //DistributedCache.addCacheFile(new URI(args[1]),conf2); Job job2 = new Job(conf2, "UserInfo"); /*Job job2 = new Job(new Configuration()); Configuration conf2 = job.getConfiguration(); job2.setJobName("Join with Cache"); DistributedCache.addCacheFile(new URI(args[1]), conf2);*/ job2.addCacheFile(new URI(args[1])); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); job2.setJarByClass(GetUserInfoGivenMovieId.class); job2.setMapperClass(MapWithJoin.class); //job2.setCombinerClass(Reduce.class); job2.setReducerClass(ReduceFinal.class); job2.setInputFormatClass(TextInputFormat.class); job2.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job2, new Path(args[3])); FileOutputFormat.setOutputPath(job2, new Path(args[4])); job2.waitForCompletion(true); } }
From source file:CrimenCasosTotales.java
public static void main(String args[]) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "casostotales"); job.setJarByClass(WordCount.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }