List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable()
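The examples below are collected from open-source projects. As a minimal, self-contained warm-up, here is a sketch of the no-argument constructor in use together with the standard Writable contract (set/get, write/readFields); the class name RoundTripDemo is purely illustrative.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;

public class RoundTripDemo {
    public static void main(String[] args) throws IOException {
        // Create an empty writable and assign a value later, as most of the examples below do.
        DoubleWritable original = new DoubleWritable();
        original.set(123.456);

        // Serialize the value to a byte buffer...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // ...and read it back into a second, reusable instance.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.get()); // prints 123.456
    }
}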
From source file: org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License: Apache License

/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node, thereby splitting the process across
 * multiple map/reduces.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the documents
 *          are generated
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minimum value of the log-likelihood ratio used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during the Map/Reduce
 *          stage. It is recommended you calculate this based on the number of cores and the free memory
 *          available per node. For example, with 2 cores and around 1GB of spare memory, a chunk size of
 *          around 400-500MB lets two simultaneous reducers create partial vectors without thrashing the
 *          system due to increased swapping.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf,
        int minSupport, int maxNGramSize, float minLLRValue, int numReducers, int chunkSizeInMegabytes,
        boolean sequentialAccess) throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, minSupport);
        dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, output,
                chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport,
                minLLRValue, numReducers);
        dictionaryChunks = createDictionaryChunks(minSupport,
                new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY),
                output, chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
    }

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
                maxTermDimension[0], sequentialAccess, numReducers);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
                sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
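As a hedged illustration of how this driver method might be invoked (not taken from the project's own code), the paths and tuning values below are hypothetical; the chunk size follows the sizing advice in the Javadoc above.

// Hypothetical invocation; paths and tuning values are illustrative only.
Configuration conf = new Configuration();
Path tokenizedDocs = new Path("/user/mahout/tokenized-documents");
Path tfVectors = new Path("/user/mahout/tf-vectors");

DictionaryVectorizer.createTermFrequencyVectors(
        tokenizedDocs,   // input: tokenized documents in SequenceFile format
        tfVectors,       // output: directory for the generated sparse vectors
        conf,            // baseConf
        2,               // minSupport
        2,               // maxNGramSize: unigrams and bigrams
        50.0f,           // minLLRValue used to prune ngrams
        1,               // numReducers
        500,             // chunkSizeInMegabytes, per the sizing advice above
        false);          // sequentialAccess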
From source file: org.apache.orc.mapred.OrcMapredRecordReader.java
License: Apache License

static DoubleWritable nextDouble(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
        row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
        DoubleWritable result;
        if (previous == null || previous.getClass() != DoubleWritable.class) {
            result = new DoubleWritable();
        } else {
            result = (DoubleWritable) previous;
        }
        result.set(((DoubleColumnVector) vector).vector[row]);
        return result;
    } else {
        return null;
    }
}
From source file: org.apache.orc.mapred.OrcStruct.java
License: Apache License

public static WritableComparable createValue(TypeDescription type) {
    switch (type.getCategory()) {
    case BOOLEAN:
        return new BooleanWritable();
    case BYTE:
        return new ByteWritable();
    case SHORT:
        return new ShortWritable();
    case INT:
        return new IntWritable();
    case LONG:
        return new LongWritable();
    case FLOAT:
        return new FloatWritable();
    case DOUBLE:
        return new DoubleWritable();
    case BINARY:
        return new BytesWritable();
    case CHAR:
    case VARCHAR:
    case STRING:
        return new Text();
    case DATE:
        return new DateWritable();
    case TIMESTAMP:
        return new OrcTimestamp();
    case DECIMAL:
        return new HiveDecimalWritable();
    case STRUCT: {
        OrcStruct result = new OrcStruct(type);
        int c = 0;
        for (TypeDescription child : type.getChildren()) {
            result.setFieldValue(c++, createValue(child));
        }
        return result;
    }
    case UNION:
        return new OrcUnion(type);
    case LIST:
        return new OrcList(type);
    case MAP:
        return new OrcMap(type);
    default:
        throw new IllegalArgumentException("Unknown type " + type);
    }
}
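A small, hypothetical usage sketch of this factory: the schema string and field values are illustrative, and the index-based setFieldValue calls mirror the ones used inside createValue above.

// Illustrative only: build a schema, then let createValue allocate matching Writables.
TypeDescription schema = TypeDescription.fromString("struct<name:string,score:double>");
OrcStruct row = (OrcStruct) OrcStruct.createValue(schema);
row.setFieldValue(0, new Text("alice"));
row.setFieldValue(1, new DoubleWritable(0.97));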
From source file: org.apache.sysml.runtime.util.MapReduceTool.java
License: Apache License

public static double[] pickValueWeight(String dir, MetaDataNumItemsByEachReducer metadata, double p,
        boolean average) throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];
    long total = ranges[ranges.length - 1];

    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);

    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;

    Path path = new Path(dir);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files)
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }

    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);

    int buffsz = 64 * 1024;
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();
    FSDataInputStream currentStream = null;
    double ret = -1;
    try {
        currentStream = fs.open(fileToRead, buffsz);

        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }

        ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
    } finally {
        IOUtilFunctions.closeSilently(currentStream);
    }
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}
From source file: org.kiji.examples.wikipediarank.RankRedistributor.java
License: Apache License

/** {@inheritDoc} */
@Override
public void setup(GathererContext<Text, DoubleWritable> context) throws IOException {
    super.setup(context);
    mLink = new Text();
    mRank = new DoubleWritable();
}
From source file: org.mrgeo.pdf.TriangularDistributionPdfCurve.java
License: Apache License

public void writePdfCurve(Path output, Configuration conf) throws IOException {
    FileSystem fs = output.getFileSystem(conf);
    Path outputFile = new Path(output, "part-r-00000");
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outputFile, DoubleWritable.class,
            DoubleWritable.class);
    DoubleWritable key = new DoubleWritable();
    DoubleWritable value = new DoubleWritable();
    double resolution = (_max - _min) / _bin;
    double binNumber = _min;
    for (int i = 0; i < _bin; i++) {
        double likelihood = getLikelihood(binNumber);
        key.set(binNumber);
        value.set(likelihood);
        writer.append(key, value);
        binNumber += resolution;
    }
    writer.close();
}
From source file: org.mrgeo.pdf.TriangularDistributionPdfCurve.java
License: Apache License

private void _computeCurve(Path[] pdfFiles, Configuration conf) throws IOException {
    SequenceFile.Reader r = null;
    _likelihoods = new double[(int) _bin];
    int index = 0;
    try {
        // Loop through each of the output files from the reduce to process all of
        // the PDF histogram bins
        for (Path pdfFile : pdfFiles) {
            // ignore all the non-part files
            if (!pdfFile.getName().startsWith("part")) {
                continue;
            }
            r = new SequenceFile.Reader(pdfFile.getFileSystem(conf), pdfFile, conf);
            DoubleWritable key = new DoubleWritable();
            DoubleWritable value = new DoubleWritable();
            while (r.next(key, value)) {
                _likelihoods[index] = value.get();
                index++;
            }
        }
    } finally {
        IOUtils.closeStream(r);
    }
}
From source file: org.pentaho.hadoop.mapreduce.converter.converters.KettleTypeToDoubleWritableConverter.java
License: Apache License

@Override
public DoubleWritable convert(ValueMetaInterface meta, Object obj) throws TypeConversionException {
    try {
        DoubleWritable result = new DoubleWritable();
        result.set(meta.getNumber(obj));
        return result;
    } catch (KettleValueException ex) {
        throw new TypeConversionException(BaseMessages.getString(TypeConverterFactory.class,
                "ErrorConverting", DoubleWritable.class.getSimpleName(), obj), ex);
    }
}
From source file: org.shaf.core.util.IOUtils.java
License: Apache License

/**
 * Reads an {@link Object} of the specified type from the {@link DataInput}.
 *
 * @param cls
 *          the type of the object to read.
 * @param in
 *          the data input stream.
 * @return the read object.
 * @throws IOException
 *           if an I/O error occurs.
 */
public static final Object readObject(Class<?> cls, DataInput in) throws IOException {
    try {
        if (cls == null) {
            throw new IOException("Reading class is not defined: null.");
        } else if (ClassUtils.isBoolean(cls)) {
            BooleanWritable obj = new BooleanWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isByte(cls)) {
            ByteWritable obj = new ByteWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isShort(cls)) {
            ShortWritable obj = new ShortWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isInteger(cls)) {
            IntWritable obj = new IntWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isLong(cls)) {
            LongWritable obj = new LongWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isFloat(cls)) {
            FloatWritable obj = new FloatWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isDouble(cls)) {
            DoubleWritable obj = new DoubleWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isString(cls)) {
            return Text.readString(in);
        } else if (ClassUtils.isEnum(cls)) {
            IntWritable obj = new IntWritable();
            obj.readFields(in);
            return cls.getEnumConstants()[obj.get()];
        } else if (ClassUtils.isArray(cls)) {
            int length = (int) readObject(int.class, in);
            Object array = Array.newInstance(cls.getComponentType(), length);
            for (int j = 0; j < length; j++) {
                Object a = readObject(cls.getComponentType(), in);
                Array.set(array, j, a);
            }
            return array;
        } else {
            Object obj = cls.newInstance();
            ((Writable) obj).readFields(in);
            return obj;
        }
    } catch (IllegalArgumentException | InstantiationException | IllegalAccessException exc) {
        throw new IOException(exc);
    }
}
From source file: org.shaf.core.util.IOUtilsTest.java
License: Apache License

/**
 * Test writing of a {@code double} value.
 */
@Test
public void testWriteDouble() {
    byte[] buf = null;
    double value = 123.456;
    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(baos);) {
        IOUtils.writeObject(value, out);
        buf = baos.toByteArray();
    } catch (IOException exc) {
        fail(exc.getMessage());
    }

    try (ByteArrayInputStream bais = new ByteArrayInputStream(buf);
            DataInputStream in = new DataInputStream(bais);) {
        DoubleWritable probe = new DoubleWritable();
        probe.readFields(in);
        assertEquals(value, probe.get(), 0.0001);
    } catch (IOException exc) {
        fail(exc.getMessage());
    }
}