List of usage examples for the org.apache.hadoop.io.LongWritable constructor
public LongWritable(long value)
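Before the project examples, here is a minimal, self-contained sketch (not taken from any of the source files listed below) of how the LongWritable(long value) constructor, the get()/set() accessors, and the Writable serialization methods fit together. The class name LongWritableBasics and the values used are illustrative assumptions only.

// Minimal sketch (illustrative): constructing, mutating, and serializing a LongWritable.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;

public class LongWritableBasics {
  public static void main(String[] args) throws IOException {
    // Construct with an initial value, then read and update it.
    LongWritable counter = new LongWritable(42L);
    System.out.println("initial value: " + counter.get());
    counter.set(counter.get() + 1);

    // Writable round trip: serialize to bytes, then read back into an
    // empty instance created with the no-argument constructor.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    counter.write(new DataOutputStream(bytes));

    LongWritable copy = new LongWritable();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println("deserialized value: " + copy.get()); // prints 43
  }
}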
From source file:PiEstimator.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
  // setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);
  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);
  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  // setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    // read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    // compute estimated value
    return BigDecimal.valueOf(4).setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}
From source file:HistogramBucket.java
License:Apache License
@Override
public void write(DataOutput d) throws IOException {
  attribute.write(d);
  LongWritable arraySize = new LongWritable(splits.size());
  arraySize.write(d);
  for (DoubleWritable w : splits) {
    w.write(d);
  }
}
From source file:FlintHadoopTest.java
License:Apache License
/**
 * Test the Map class
 * @throws IOException
 * @throws InstantiationException
 * @throws IllegalAccessException
 */
@Test
public void testMap() throws IOException, InstantiationException, IllegalAccessException {
  mapDriver.withInput(new LongWritable(0), new Text(testPdf1Path));
  assertOutputMatchesRecord(mapDriver.run().get(0), testPdf1CheckResult, testPdf1Name);
}
From source file:MRDriver.java
License:Apache License
public int run(String args[]) throws Exception {
  FileSystem fs = null;
  Path samplesMapPath = null;

  float epsilon = Float.parseFloat(args[0]);
  double delta = Double.parseDouble(args[1]);
  int minFreqPercent = Integer.parseInt(args[2]);
  int d = Integer.parseInt(args[3]);
  int datasetSize = Integer.parseInt(args[4]);
  int numSamples = Integer.parseInt(args[5]);
  double phi = Double.parseDouble(args[6]);
  Random rand;

  /************************ Job 1 (local FIM) Configuration ************************/
  JobConf conf = new JobConf(getConf());

  /*
   * Compute the number of required "votes" for an itemset to be
   * declared frequent
   */
  // The +1 at the end is needed to ensure reqApproxNum > numSamples / 2.
  int reqApproxNum = (int) Math
      .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
  int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
  //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

  conf.setInt("PARMM.reducersNum", numSamples);
  conf.setInt("PARMM.datasetSize", datasetSize);
  conf.setInt("PARMM.minFreqPercent", minFreqPercent);
  conf.setInt("PARMM.sampleSize", sampleSize);
  conf.setFloat("PARMM.epsilon", epsilon);

  // Set the number of reducers equal to the number of samples, to
  // maximize parallelism. Required by our Partitioner.
  conf.setNumReduceTasks(numSamples);

  // XXX: why do we disable the speculative execution? MR
  conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
  conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

  /*
   * Enable compression of map output.
   *
   * We do it for this job and not for the aggregation one because
   * each mapper there only prints out one record for each itemset,
   * so there isn't much to compress, I'd say. MR
   *
   * In Amazon MapReduce compression of the map output seems to
   * happen by default and the Snappy codec is used, which is
   * extremely fast.
   */
  conf.setBoolean("mapred.compress.map.output", true);
  //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

  conf.setJarByClass(MRDriver.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(DoubleWritable.class);
  conf.setInputFormat(SequenceFileInputFormat.class);
  // We write the collections found in a reducer as a SequenceFile
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

  // set the mapper class based on command line option
  switch (Integer.parseInt(args[7])) {
  case 1:
    System.out.println("running partition mapper...");
    SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    conf.setMapperClass(PartitionMapper.class);
    break;
  case 2:
    System.out.println("running binomial mapper...");
    SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    conf.setMapperClass(BinomialSamplerMapper.class);
    break;
  case 3:
    System.out.println("running coin mapper...");
    SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    conf.setMapperClass(CoinFlipSamplerMapper.class);
  case 4:
    System.out.println("running sampler mapper...");
    SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
    conf.setMapperClass(InputSamplerMapper.class);

    // create a random sample of size T*m
    rand = new Random();
    long sampling_start_time = System.nanoTime();
    int[] samples = new int[numSamples * sampleSize];
    for (int i = 0; i < numSamples * sampleSize; i++) {
      samples[i] = rand.nextInt(datasetSize);
    }

    // for each key in the sample, create a list of all T samples to which this key belongs
    Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
    for (int i = 0; i < numSamples * sampleSize; i++) {
      ArrayList<IntWritable> sampleIDs = null;
      LongWritable key = new LongWritable(samples[i]);
      if (hashTable.containsKey(key))
        sampleIDs = hashTable.get(key);
      else
        sampleIDs = new ArrayList<IntWritable>();
      sampleIDs.add(new IntWritable(i % numSamples));
      hashTable.put(key, sampleIDs);
    }

    /*
     * Convert the Hashtable to a MapWritable which we will
     * write to HDFS and distribute to all Mappers using
     * DistributedCache
     */
    MapWritable map = new MapWritable();
    for (LongWritable key : hashTable.keySet()) {
      ArrayList<IntWritable> sampleIDs = hashTable.get(key);
      IntArrayWritable sampleIDsIAW = new IntArrayWritable();
      sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
      map.put(key, sampleIDsIAW);
    }

    fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
    samplesMapPath = new Path("samplesMap.ser");
    FSDataOutputStream out = fs.create(samplesMapPath, true);
    map.write(out);
    out.sync();
    out.close();
    DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"), conf);

    // stop the sampling timer
    long sampling_end_time = System.nanoTime();
    long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
    System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
    break; // end switch case
  case 5:
    System.out.println("running random integer partition mapper...");
    conf.setInputFormat(WholeSplitInputFormat.class);
    Path inputFilePath = new Path(args[8]);
    WholeSplitInputFormat.addInputPath(conf, inputFilePath);
    conf.setMapperClass(RandIntPartSamplerMapper.class);

    // Compute number of map tasks.
    fs = inputFilePath.getFileSystem(conf);
    FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
    long len = inputFileStatus.getLen();
    long blockSize = inputFileStatus.getBlockSize();
    conf.setLong("mapred.min.split.size", blockSize);
    conf.setLong("mapred.max.split.size", blockSize);
    int mapTasksNum = ((int) (len / blockSize)) + 1;
    conf.setNumMapTasks(mapTasksNum);
    //System.out.println("len: " + len + " blockSize: "
    //    + blockSize + " mapTasksNum: " + mapTasksNum);

    // Extract random integer partition of total sample
    // size into up to mapTasksNum partitions.
    // XXX I'm not sure this is a correct way to do it.
    rand = new Random();
    IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
    for (int j = 0; j < numSamples; j++) {
      IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
      int sum = 0;
      int i;
      for (i = 0; i < mapTasksNum - 1; i++) {
        int size = rand.nextInt(sampleSize - sum);
        tempToSampleArr[i] = new IntWritable(size);
        sum += size;
        if (sum > numSamples * sampleSize) {
          System.out.println("Something went wrong generating the sample Sizes");
          System.exit(1);
        }
        if (sum == sampleSize) {
          break;
        }
      }
      if (i == mapTasksNum - 1) {
        tempToSampleArr[i] = new IntWritable(sampleSize - sum);
      } else {
        for (; i < mapTasksNum; i++) {
          tempToSampleArr[i] = new IntWritable(0);
        }
      }
      Collections.shuffle(Arrays.asList(tempToSampleArr));
      for (i = 0; i < mapTasksNum; i++) {
        toSampleArr[i][j] = tempToSampleArr[i];
      }
    }
    for (int i = 0; i < mapTasksNum; i++) {
      DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
    }
    break;
  default:
    System.err.println("Wrong Mapper ID. Can only be in [1,5]");
    System.exit(1);
    break;
  }

  /*
   * We don't use the default hash partitioner because we want to
   * maximize the parallelism. That's why we also fix the number
   * of reducers.
   */
  conf.setPartitionerClass(FIMPartitioner.class);
  conf.setReducerClass(FIMReducer.class);

  /************************ Job 2 (aggregation) Configuration ************************/
  JobConf confAggr = new JobConf(getConf());

  confAggr.setInt("PARMM.reducersNum", numSamples);
  confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
  confAggr.setInt("PARMM.sampleSize", sampleSize);
  confAggr.setFloat("PARMM.epsilon", epsilon);

  // XXX: Why do we disable speculative execution? MR
  confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
  confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

  confAggr.setJarByClass(MRDriver.class);
  confAggr.setMapOutputKeyClass(Text.class);
  confAggr.setMapOutputValueClass(DoubleWritable.class);
  confAggr.setOutputKeyClass(Text.class);
  confAggr.setOutputValueClass(Text.class);

  confAggr.setMapperClass(AggregateMapper.class);
  confAggr.setReducerClass(AggregateReducer.class);

  confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
  SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));
  FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

  long FIMjob_start_time = System.currentTimeMillis();
  RunningJob FIMjob = JobClient.runJob(conf);
  long FIMjob_end_time = System.currentTimeMillis();

  RunningJob aggregateJob = JobClient.runJob(confAggr);
  long aggrJob_end_time = System.currentTimeMillis();

  long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;
  long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

  if (args[7].equals("4")) {
    // Remove samplesMap file
    fs.delete(samplesMapPath, false);
  }

  Counters counters = FIMjob.getCounters();

  Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
  long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
  int i = 0;
  for (Counters.Counter counter : FIMMapperStartTimesCounters) {
    FIMMapperStartTimes[i++] = counter.getCounter();
  }

  Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
  long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : FIMMapperEndTimesCounters) {
    FIMMapperEndTimes[i++] = counter.getCounter();
  }

  Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
  long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : FIMReducerStartTimesCounters) {
    FIMReducerStartTimes[i++] = counter.getCounter();
  }

  Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
  long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : FIMReducerEndTimesCounters) {
    FIMReducerEndTimes[i++] = counter.getCounter();
  }

  Counters countersAggr = aggregateJob.getCounters();

  Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
  long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
    AggregateMapperStartTimes[i++] = counter.getCounter();
  }

  Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
  long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
    AggregateMapperEndTimes[i++] = counter.getCounter();
  }

  Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
  long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
    AggregateReducerStartTimes[i++] = counter.getCounter();
  }

  Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
  long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
  i = 0;
  for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
    AggregateReducerEndTimes[i++] = counter.getCounter();
  }

  long FIMMapperStartMin = FIMMapperStartTimes[0];
  for (long l : FIMMapperStartTimes) {
    if (l < FIMMapperStartMin) {
      FIMMapperStartMin = l;
    }
  }
  long FIMMapperEndMax = FIMMapperEndTimes[0];
  for (long l : FIMMapperEndTimes) {
    if (l > FIMMapperEndMax) {
      FIMMapperEndMax = l;
    }
  }
  System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
  System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));

  long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
  long FIMMapperRunTimesSum = 0;
  for (int l = 0; l < FIMMapperStartTimes.length; l++) {
    FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
    FIMMapperRunTimesSum += FIMMapperRunTimes[l];
  }
  System.out.println("FIMMapper average task runtime (milliseconds): "
      + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
  long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
  long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
  for (long l : FIMMapperRunTimes) {
    if (l < FIMMapperRunTimesMin) {
      FIMMapperRunTimesMin = l;
    }
    if (l > FIMMapperRunTimesMax) {
      FIMMapperRunTimesMax = l;
    }
  }
  System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
  System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

  long FIMReducerStartMin = FIMReducerStartTimes[0];
  for (long l : FIMReducerStartTimes) {
    if (l < FIMReducerStartMin) {
      FIMReducerStartMin = l;
    }
  }
  long FIMReducerEndMax = FIMReducerEndTimes[0];
  for (long l : FIMReducerEndTimes) {
    if (l > FIMReducerEndMax) {
      FIMReducerEndMax = l;
    }
  }
  System.out.println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
  System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));

  long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
  long FIMReducerRunTimesSum = 0;
  for (int l = 0; l < FIMReducerStartTimes.length; l++) {
    FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
    FIMReducerRunTimesSum += FIMReducerRunTimes[l];
  }
  System.out.println("FIMReducer average task runtime (milliseconds): "
      + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
  long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
  long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
  for (long l : FIMReducerRunTimes) {
    if (l < FIMReducerRunTimesMin) {
      FIMReducerRunTimesMin = l;
    }
    if (l > FIMReducerRunTimesMax) {
      FIMReducerRunTimesMax = l;
    }
  }
  System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
  System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
  System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

  long AggregateMapperStartMin = AggregateMapperStartTimes[0];
  for (long l : AggregateMapperStartTimes) {
    if (l < AggregateMapperStartMin) {
      AggregateMapperStartMin = l;
    }
  }
  long AggregateMapperEndMax = AggregateMapperEndTimes[0];
  for (long l : AggregateMapperEndTimes) {
    if (l > AggregateMapperEndMax) {
      AggregateMapperEndMax = l;
    }
  }
  System.out.println("Aggregation job setup time (milliseconds): "
      + (AggregateMapperStartMin - FIMjob_end_time));
  System.out.println("AggregateMapper total runtime (milliseconds): "
      + (AggregateMapperEndMax - AggregateMapperStartMin));

  long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
  long AggregateMapperRunTimesSum = 0;
  for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
    AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
    AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
  }
  System.out.println("AggregateMapper average task runtime (milliseconds): "
      + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
  long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
  long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
  for (long l : AggregateMapperRunTimes) {
    if (l < AggregateMapperRunTimesMin) {
      AggregateMapperRunTimesMin = l;
    }
    if (l > AggregateMapperRunTimesMax) {
      AggregateMapperRunTimesMax = l;
    }
  }
  System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
  System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

  long AggregateReducerStartMin = AggregateReducerStartTimes[0];
  for (long l : AggregateReducerStartTimes) {
    if (l < AggregateReducerStartMin) {
      AggregateReducerStartMin = l;
    }
  }
  long AggregateReducerEndMax = AggregateReducerEndTimes[0];
  for (long l : AggregateReducerEndTimes) {
    if (l > AggregateReducerEndMax) {
      AggregateReducerEndMax = l;
    }
  }
  System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
      + (AggregateReducerStartMin - AggregateMapperEndMax));
  System.out.println("AggregateReducer total runtime (milliseconds): "
      + (AggregateReducerEndMax - AggregateReducerStartMin));

  long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
  long AggregateReducerRunTimesSum = 0;
  for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
    AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
    AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
  }
  System.out.println("AggregateReducer average task runtime (milliseconds): "
      + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
  long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
  long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
  for (long l : AggregateReducerRunTimes) {
    if (l < AggregateReducerRunTimesMin) {
      AggregateReducerRunTimesMin = l;
    }
    if (l > AggregateReducerRunTimesMax) {
      AggregateReducerRunTimesMax = l;
    }
  }
  System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
  System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);
  System.out.println("Aggregation job cooldown time (milliseconds): "
      + (aggrJob_end_time - AggregateReducerEndMax));

  System.out.println("total runtime (all inclusive) (milliseconds): "
      + (aggrJob_end_time - FIMjob_start_time));
  System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
      + (AggregateReducerEndMax - FIMMapperStartMin));
  System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
      + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
  System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
  System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
      + (FIMReducerEndMax - FIMMapperStartMin));
  System.out.println("Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
  System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
      + (AggregateReducerEndMax - AggregateMapperStartMin));

  return 0;
}
From source file:alluxio.client.hadoop.DFSIOIntegrationTest.java
License:Apache License
@SuppressWarnings("deprecation")
private void createControlFile(org.apache.hadoop.fs.FileSystem fs, long nrBytes, // in bytes
    int nrFiles) throws IOException {
  LOG.info("creating control file: " + nrBytes + " bytes, " + nrFiles + " files");

  Path controlDir = getControlDir(mConfig);

  if (!fs.exists(controlDir)) {
    fs.delete(controlDir, true);

    for (int i = 0; i < nrFiles; i++) {
      String name = getFileName(i);
      Path controlFile = new Path(controlDir, "in_file_" + name);
      SequenceFile.Writer writer = null;
      try {
        writer = SequenceFile.createWriter(fs, mConfig, controlFile, Text.class, LongWritable.class,
            CompressionType.NONE);
        writer.append(new Text(name), new LongWritable(nrBytes));
      } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
      } finally {
        if (writer != null) {
          writer.close();
        }
        writer = null;
      }
    }
  }
  LOG.info("created control files for: " + nrFiles + " files");
}
From source file:Assignment4_P4_MemoryConscious.MovingRatingMemConscious_Combiner.java
public void reduce(IntWritable key, Iterable<SortedMapWritable> values, Context context)
    throws IOException, InterruptedException {
  // loop through each hashmap for this movie id
  for (SortedMapWritable val : values) {
    // inside each hashmap, loop for every entry
    for (Map.Entry<WritableComparable, Writable> entry : val.entrySet()) {
      // check if current entry's key is already present in new hashmap
      if (result.containsKey(entry.getKey())) {
        // if yes, extract current value from result hashmap for this key
        LongWritable existingValue = (LongWritable) result.get(entry.getKey());
        // increment existing value by 1
        existingValue.set(existingValue.get() + 1);
        // update result hashmap with new value
        result.put(entry.getKey(), existingValue);
      } else {
        // if not, create new entry with init value 1
        result.put(entry.getKey(), new LongWritable(1));
      }
    }
    val.clear();
  }
  context.write(key, result);
}
From source file:at.illecker.hama.hybrid.examples.onlinecf.OnlineCFHybridBenchmark.java
License:Apache License
public static List<double[]> generateRandomInputData(Configuration conf, FileSystem fs, Path in,
    int numBspTask, int numGPUBspTask, int userCount, int itemCount, int percentNonZeroValues,
    int GPUPercentage, int maxTestPrefs) throws IOException {

  // Delete input directory if already exist
  if (fs.exists(in)) {
    fs.delete(in, true);
  }

  Random rand = new Random(32L);
  Set<Map.Entry<Long, Long>> userItemPairs = new HashSet<Map.Entry<Long, Long>>();
  List<double[]> testItems = new ArrayList<double[]>();

  int possibleUserItemRatings = userCount * itemCount;
  int userItemRatings = possibleUserItemRatings * percentNonZeroValues / 100;
  System.out.println("generateRandomInputData possibleRatings: " + possibleUserItemRatings
      + " ratings: " + userItemRatings);

  // Compute work distributions
  int cpuTaskNum = numBspTask - numGPUBspTask;
  long ratingsPerGPUTask = 0;
  long ratingsPerCPU = 0;
  long ratingsPerCPUTask = 0;
  if ((numGPUBspTask > 0) && (GPUPercentage > 0) && (GPUPercentage <= 100)) {
    ratingsPerGPUTask = (userItemRatings * GPUPercentage) / 100;
    ratingsPerCPU = userItemRatings - ratingsPerGPUTask;
  } else {
    ratingsPerCPU = userItemRatings;
  }
  if (cpuTaskNum > 0) {
    ratingsPerCPUTask = ratingsPerCPU / cpuTaskNum;
  }
  System.out.println("generateRandomInputData ratingsPerGPUTask: " + ratingsPerGPUTask
      + " ratingsPerCPU: " + ratingsPerCPU + " ratingsPerCPUTask: " + ratingsPerCPUTask);

  for (int part = 0; part < numBspTask; part++) {
    Path partIn = new Path(in, "part" + part + ".seq");
    final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, partIn,
        LongWritable.class, PipesVectorWritable.class, CompressionType.NONE);

    long interval = 0;
    if (part > cpuTaskNum) {
      interval = ratingsPerGPUTask;
    } else {
      interval = ratingsPerCPUTask;
    }
    long start = interval * part;
    long end = start + interval - 1;
    if ((numBspTask - 1) == part) {
      end = userItemRatings;
    }
    System.out.println("Partition " + part + ": from " + start + " to " + end);

    for (long i = start; i <= end; i++) {
      // Find new user item rating which was not used before
      Map.Entry<Long, Long> userItemPair;
      do {
        long userId = rand.nextInt(userCount);
        long itemId = rand.nextInt(itemCount);
        userItemPair = new AbstractMap.SimpleImmutableEntry<Long, Long>(userId, itemId);
      } while (userItemPairs.contains(userItemPair));

      // Add user item rating
      userItemPairs.add(userItemPair);

      // Generate rating
      int rating = rand.nextInt(5) + 1; // values between 1 and 5

      // Add user item rating to test data
      if (i < maxTestPrefs) {
        testItems.add(new double[] { userItemPair.getKey(), userItemPair.getValue(), rating });
      }

      // Write out user item rating
      dataWriter.append(new LongWritable(userItemPair.getKey()), new PipesVectorWritable(
          new DenseDoubleVector(new double[] { userItemPair.getValue(), rating })));
    }
    dataWriter.close();
  }

  return testItems;
}
From source file:at.illecker.hama.hybrid.examples.onlinecf.OnlineCFHybridBenchmark.java
License:Apache License
public static List<double[]> convertInputData(Configuration conf, FileSystem fs, Path in,
    Path preferencesIn, String inputFile, String separator, int maxTestPrefs) throws IOException {

  List<double[]> testItems = new ArrayList<double[]>();

  // Delete input files if already exist
  if (fs.exists(in)) {
    fs.delete(in, true);
  }
  if (fs.exists(preferencesIn)) {
    fs.delete(preferencesIn, true);
  }

  final SequenceFile.Writer prefWriter = SequenceFile.createWriter(fs, conf, preferencesIn,
      LongWritable.class, PipesVectorWritable.class, CompressionType.NONE);

  BufferedReader br = new BufferedReader(new FileReader(inputFile));
  String line;
  while ((line = br.readLine()) != null) {
    String[] values = line.split(separator);
    long userId = Long.parseLong(values[0]);
    long itemId = Long.parseLong(values[1]);
    double rating = Double.parseDouble(values[2]);
    // System.out.println("userId: " + userId + " itemId: " + itemId
    //     + " rating: " + rating);

    double vector[] = new double[2];
    vector[0] = itemId;
    vector[1] = rating;
    prefWriter.append(new LongWritable(userId), new PipesVectorWritable(new DenseDoubleVector(vector)));

    // Add test preferences
    maxTestPrefs--;
    if (maxTestPrefs > 0) {
      testItems.add(new double[] { userId, itemId, rating });
    }
  }
  br.close();
  prefWriter.close();

  return testItems;
}
From source file:at.illecker.hama.hybrid.examples.onlinecf.OnlineCFTrainHybridBSP.java
License:Apache License
public static List<Preference<Long, Long>> prepareTestInputData(Configuration conf, FileSystem fs,
    Path in, Path preferencesIn) throws IOException {

  Preference[] train_prefs = { new Preference<Integer, Integer>(1, 0, 4),
      new Preference<Integer, Integer>(1, 1, 2.5), new Preference<Integer, Integer>(1, 2, 3.5),
      new Preference<Integer, Integer>(2, 0, 4), new Preference<Integer, Integer>(2, 1, 2.5),
      new Preference<Integer, Integer>(2, 2, 3.5), new Preference<Integer, Integer>(2, 3, 1),
      new Preference<Integer, Integer>(2, 4, 3.5), new Preference<Integer, Integer>(3, 0, 4),
      new Preference<Integer, Integer>(3, 1, 2.5), new Preference<Integer, Integer>(3, 2, 3.5),
      new Preference<Integer, Integer>(3, 3, 1), new Preference<Integer, Integer>(3, 4, 3.5) };

  List<Preference<Long, Long>> test_prefs = new ArrayList<Preference<Long, Long>>();
  test_prefs.add(new Preference<Long, Long>(1l, 0l, 4));
  test_prefs.add(new Preference<Long, Long>(1l, 1l, 2.5));
  test_prefs.add(new Preference<Long, Long>(1l, 2l, 3.5));
  test_prefs.add(new Preference<Long, Long>(1l, 3l, 1));
  test_prefs.add(new Preference<Long, Long>(1l, 4l, 3.5));

  // Delete input files if already exist
  if (fs.exists(in)) {
    fs.delete(in, true);
  }
  if (fs.exists(preferencesIn)) {
    fs.delete(preferencesIn, true);
  }

  final SequenceFile.Writer prefWriter = SequenceFile.createWriter(fs, conf, preferencesIn,
      LongWritable.class, PipesVectorWritable.class, CompressionType.NONE);

  for (Preference<Integer, Integer> taste : train_prefs) {
    double values[] = new double[2];
    values[0] = taste.getItemId();
    values[1] = taste.getValue().get();

    prefWriter.append(new LongWritable(taste.getUserId()),
        new PipesVectorWritable(new DenseDoubleVector(values)));
  }
  prefWriter.close();

  return test_prefs;
}
From source file:at.illecker.hama.hybrid.examples.onlinecf.OnlineCFTrainHybridBSP.java
License:Apache License
public static List<Preference<Long, Long>> generateRandomInputData(Configuration conf, FileSystem fs,
    Path in, int numBspTask, int numGPUBspTask, int userCount, int itemCount,
    int percentNonZeroValues, int GPUPercentage, int maxTestPrefs) throws IOException {

  // Delete input directory if already exist
  if (fs.exists(in)) {
    fs.delete(in, true);
  }

  Random rand = new Random(32L);
  Set<Map.Entry<Long, Long>> userItemPairs = new HashSet<Map.Entry<Long, Long>>();
  List<Preference<Long, Long>> testItems = new ArrayList<Preference<Long, Long>>();

  int possibleUserItemRatings = userCount * itemCount;
  int userItemRatings = possibleUserItemRatings * percentNonZeroValues / 100;
  System.out.println("generateRandomInputData possibleRatings: " + possibleUserItemRatings
      + " ratings: " + userItemRatings);

  // Compute work distributions
  int cpuTaskNum = numBspTask - numGPUBspTask;
  long ratingsPerGPUTask = 0;
  long ratingsPerCPU = 0;
  long ratingsPerCPUTask = 0;
  if ((numGPUBspTask > 0) && (GPUPercentage > 0) && (GPUPercentage <= 100)) {
    ratingsPerGPUTask = (userItemRatings * GPUPercentage) / 100;
    ratingsPerCPU = userItemRatings - ratingsPerGPUTask;
  } else {
    ratingsPerCPU = userItemRatings;
  }
  if (cpuTaskNum > 0) {
    ratingsPerCPUTask = ratingsPerCPU / cpuTaskNum;
  }
  System.out.println("generateRandomInputData ratingsPerGPUTask: " + ratingsPerGPUTask
      + " ratingsPerCPU: " + ratingsPerCPU + " ratingsPerCPUTask: " + ratingsPerCPUTask);

  for (int part = 0; part < numBspTask; part++) {
    Path partIn = new Path(in, "part" + part + ".seq");
    final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, partIn,
        LongWritable.class, PipesVectorWritable.class, CompressionType.NONE);

    long interval = 0;
    if (part > cpuTaskNum) {
      interval = ratingsPerGPUTask;
    } else {
      interval = ratingsPerCPUTask;
    }
    long start = interval * part;
    long end = start + interval - 1;
    if ((numBspTask - 1) == part) {
      end = userItemRatings;
    }
    LOG.info("Partition " + part + ": from " + start + " to " + end);

    for (long i = start; i <= end; i++) {
      // Find new user item rating which was not used before
      Map.Entry<Long, Long> userItemPair;
      do {
        long userId = rand.nextInt(userCount);
        long itemId = rand.nextInt(itemCount);
        userItemPair = new AbstractMap.SimpleImmutableEntry<Long, Long>(userId, itemId);
      } while (userItemPairs.contains(userItemPair));

      // Add user item rating
      userItemPairs.add(userItemPair);

      // Generate rating
      int rating = rand.nextInt(5) + 1; // values between 1 and 5

      // Add user item rating to test data
      if (i < maxTestPrefs) {
        testItems.add(new Preference<Long, Long>(userItemPair.getKey(), userItemPair.getValue(), rating));
      }

      // Write out user item rating
      dataWriter.append(new LongWritable(userItemPair.getKey()), new PipesVectorWritable(
          new DenseDoubleVector(new double[] { userItemPair.getValue(), rating })));
    }
    dataWriter.close();
  }

  return testItems;
}