List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable()
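The examples below are collected from open-source projects. As a minimal, self-contained warm-up, here is a sketch of the no-argument constructor in use together with the standard Writable contract (set/get, write/readFields); the class name RoundTripDemo is purely illustrative.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;

public class RoundTripDemo {
    public static void main(String[] args) throws IOException {
        // Create an empty writable and assign a value later, as most of the examples below do.
        DoubleWritable original = new DoubleWritable();
        original.set(123.456);

        // Serialize the value to a byte buffer...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // ...and read it back into a second, reusable instance.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.get()); // prints 123.456
    }
}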
From source file: org.apache.mahout.utils.vectors.text.DictionaryVectorizer.java
License: Apache License

/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node, thereby splitting the process across
 * multiple map/reduces.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the documents
 *          are generated
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minimum value of the log-likelihood ratio used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during the Map/Reduce
 *          stage. It is recommended you calculate this based on the number of cores and the free memory
 *          available per node. For example, with 2 cores and around 1GB of spare memory, a chunk size of
 *          around 400-500MB lets two simultaneous reducers create partial vectors without thrashing the
 *          system due to increased swapping.
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void createTermFrequencyVectors(Path input, Path output, Configuration baseConf,
        int minSupport, int maxNGramSize, float minLLRValue, int numReducers, int chunkSizeInMegabytes,
        boolean sequentialAccess) throws IOException, InterruptedException, ClassNotFoundException {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, minSupport);
        dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, output,
                chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport,
                minLLRValue, numReducers);
        dictionaryChunks = createDictionaryChunks(minSupport,
                new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY),
                output, chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
    }

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = new ArrayList<Path>();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
                maxTermDimension[0], sequentialAccess, numReducers);
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
    if (dictionaryChunks.size() > 1) {
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
                sequentialAccess, numReducers);
        HadoopUtil.deletePaths(partialVectorPaths, fs);
    } else {
        Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
        fs.delete(outputDir, true);
        fs.rename(singlePartialVectorOutputPath, outputDir);
    }
}
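As a hedged illustration of how this driver method might be invoked (not taken from the project's own code), the paths and tuning values below are hypothetical; the chunk size follows the sizing advice in the Javadoc above.

// Hypothetical invocation; paths and tuning values are illustrative only.
Configuration conf = new Configuration();
Path tokenizedDocs = new Path("/user/mahout/tokenized-documents");
Path tfVectors = new Path("/user/mahout/tf-vectors");

DictionaryVectorizer.createTermFrequencyVectors(
        tokenizedDocs,   // input: tokenized documents in SequenceFile format
        tfVectors,       // output: directory for the generated sparse vectors
        conf,            // baseConf
        2,               // minSupport
        2,               // maxNGramSize: unigrams and bigrams
        50.0f,           // minLLRValue used to prune ngrams
        1,               // numReducers
        500,             // chunkSizeInMegabytes, per the sizing advice above
        false);          // sequentialAccess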
From source file: org.apache.orc.mapred.OrcMapredRecordReader.java
License: Apache License

static DoubleWritable nextDouble(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
        row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
        DoubleWritable result;
        if (previous == null || previous.getClass() != DoubleWritable.class) {
            result = new DoubleWritable();
        } else {
            result = (DoubleWritable) previous;
        }
        result.set(((DoubleColumnVector) vector).vector[row]);
        return result;
    } else {
        return null;
    }
}
From source file: org.apache.orc.mapred.OrcStruct.java
License: Apache License

public static WritableComparable createValue(TypeDescription type) {
    switch (type.getCategory()) {
    case BOOLEAN:
        return new BooleanWritable();
    case BYTE:
        return new ByteWritable();
    case SHORT:
        return new ShortWritable();
    case INT:
        return new IntWritable();
    case LONG:
        return new LongWritable();
    case FLOAT:
        return new FloatWritable();
    case DOUBLE:
        return new DoubleWritable();
    case BINARY:
        return new BytesWritable();
    case CHAR:
    case VARCHAR:
    case STRING:
        return new Text();
    case DATE:
        return new DateWritable();
    case TIMESTAMP:
        return new OrcTimestamp();
    case DECIMAL:
        return new HiveDecimalWritable();
    case STRUCT: {
        OrcStruct result = new OrcStruct(type);
        int c = 0;
        for (TypeDescription child : type.getChildren()) {
            result.setFieldValue(c++, createValue(child));
        }
        return result;
    }
    case UNION:
        return new OrcUnion(type);
    case LIST:
        return new OrcList(type);
    case MAP:
        return new OrcMap(type);
    default:
        throw new IllegalArgumentException("Unknown type " + type);
    }
}
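A small, hypothetical usage sketch of this factory: the schema string and field values are illustrative, and the index-based setFieldValue calls mirror the ones used inside createValue above.

// Illustrative only: build a schema, then let createValue allocate matching Writables.
TypeDescription schema = TypeDescription.fromString("struct<name:string,score:double>");
OrcStruct row = (OrcStruct) OrcStruct.createValue(schema);
row.setFieldValue(0, new Text("alice"));
row.setFieldValue(1, new DoubleWritable(0.97));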
From source file: org.apache.sysml.runtime.util.MapReduceTool.java
License: Apache License

public static double[] pickValueWeight(String dir, MetaDataNumItemsByEachReducer metadata, double p,
        boolean average) throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];
    long total = ranges[ranges.length - 1];

    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);

    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;

    Path path = new Path(dir);
    FileSystem fs = IOUtilFunctions.getFileSystem(path);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files)
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }

    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);

    int buffsz = 64 * 1024;
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();
    FSDataInputStream currentStream = null;
    double ret = -1;
    try {
        currentStream = fs.open(fileToRead, buffsz);

        boolean contain0s = false;
        long numZeros = 0;
        if (currentPart == metadata.getPartitionOfZero()) {
            contain0s = true;
            numZeros = metadata.getNumberOfZero();
        }
        ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

        int numRead = 0;
        while (numRead <= offset) {
            reader.readNextKeyValuePairs(readKey, readValue);
            numRead += readValue.get();
            cum_weight += readValue.get();
        }

        ret = readKey.get();
        if (average) {
            if (numRead <= offset + 1) {
                reader.readNextKeyValuePairs(readKey, readValue);
                cum_weight += readValue.get();
                ret = (ret + readKey.get()) / 2;
            }
        }
    } finally {
        IOUtilFunctions.closeSilently(currentStream);
    }
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}
From source file: org.kiji.examples.wikipediarank.RankRedistributor.java
License: Apache License

/** {@inheritDoc} */
@Override
public void setup(GathererContext<Text, DoubleWritable> context) throws IOException {
    super.setup(context);
    mLink = new Text();
    mRank = new DoubleWritable();
}
From source file: org.mrgeo.pdf.TriangularDistributionPdfCurve.java
License: Apache License

public void writePdfCurve(Path output, Configuration conf) throws IOException {
    FileSystem fs = output.getFileSystem(conf);
    Path outputFile = new Path(output, "part-r-00000");
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outputFile, DoubleWritable.class,
            DoubleWritable.class);
    DoubleWritable key = new DoubleWritable();
    DoubleWritable value = new DoubleWritable();
    double resolution = (_max - _min) / _bin;
    double binNumber = _min;
    for (int i = 0; i < _bin; i++) {
        double likelihood = getLikelihood(binNumber);
        key.set(binNumber);
        value.set(likelihood);
        writer.append(key, value);
        binNumber += resolution;
    }
    writer.close();
}
From source file: org.mrgeo.pdf.TriangularDistributionPdfCurve.java
License: Apache License

private void _computeCurve(Path[] pdfFiles, Configuration conf) throws IOException {
    SequenceFile.Reader r = null;
    _likelihoods = new double[(int) _bin];
    int index = 0;
    try {
        // Loop through each of the output files from the reduce to process all of
        // the PDF histogram bins
        for (Path pdfFile : pdfFiles) {
            // ignore all the non-part files
            if (!pdfFile.getName().startsWith("part")) {
                continue;
            }
            r = new SequenceFile.Reader(pdfFile.getFileSystem(conf), pdfFile, conf);
            DoubleWritable key = new DoubleWritable();
            DoubleWritable value = new DoubleWritable();
            while (r.next(key, value)) {
                _likelihoods[index] = value.get();
                index++;
            }
        }
    } finally {
        IOUtils.closeStream(r);
    }
}
From source file: org.pentaho.hadoop.mapreduce.converter.converters.KettleTypeToDoubleWritableConverter.java
License: Apache License

@Override
public DoubleWritable convert(ValueMetaInterface meta, Object obj) throws TypeConversionException {
    try {
        DoubleWritable result = new DoubleWritable();
        result.set(meta.getNumber(obj));
        return result;
    } catch (KettleValueException ex) {
        throw new TypeConversionException(BaseMessages.getString(TypeConverterFactory.class,
                "ErrorConverting", DoubleWritable.class.getSimpleName(), obj), ex);
    }
}
From source file: org.shaf.core.util.IOUtils.java
License: Apache License

/**
 * Reads an {@link Object} of the specified type from the {@link DataInput}.
 *
 * @param cls
 *          the type of the object to read.
 * @param in
 *          the data input stream.
 * @return the read object.
 * @throws IOException
 *           if an I/O error occurs.
 */
public static final Object readObject(Class<?> cls, DataInput in) throws IOException {
    try {
        if (cls == null) {
            throw new IOException("Reading class is not defined: null.");
        } else if (ClassUtils.isBoolean(cls)) {
            BooleanWritable obj = new BooleanWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isByte(cls)) {
            ByteWritable obj = new ByteWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isShort(cls)) {
            ShortWritable obj = new ShortWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isInteger(cls)) {
            IntWritable obj = new IntWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isLong(cls)) {
            LongWritable obj = new LongWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isFloat(cls)) {
            FloatWritable obj = new FloatWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isDouble(cls)) {
            DoubleWritable obj = new DoubleWritable();
            obj.readFields(in);
            return obj.get();
        } else if (ClassUtils.isString(cls)) {
            return Text.readString(in);
        } else if (ClassUtils.isEnum(cls)) {
            IntWritable obj = new IntWritable();
            obj.readFields(in);
            return cls.getEnumConstants()[obj.get()];
        } else if (ClassUtils.isArray(cls)) {
            int length = (int) readObject(int.class, in);
            Object array = Array.newInstance(cls.getComponentType(), length);
            for (int j = 0; j < length; j++) {
                Object a = readObject(cls.getComponentType(), in);
                Array.set(array, j, a);
            }
            return array;
        } else {
            Object obj = cls.newInstance();
            ((Writable) obj).readFields(in);
            return obj;
        }
    } catch (IllegalArgumentException | InstantiationException | IllegalAccessException exc) {
        throw new IOException(exc);
    }
}
From source file: org.shaf.core.util.IOUtilsTest.java
License: Apache License

/**
 * Test writing of a {@code double} value.
 */
@Test
public void testWriteDouble() {
    byte[] buf = null;
    double value = 123.456;
    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(baos);) {
        IOUtils.writeObject(value, out);
        buf = baos.toByteArray();
    } catch (IOException exc) {
        fail(exc.getMessage());
    }

    try (ByteArrayInputStream bais = new ByteArrayInputStream(buf);
            DataInputStream in = new DataInputStream(bais);) {
        DoubleWritable probe = new DoubleWritable();
        probe.readFields(in);
        assertEquals(value, probe.get(), 0.0001);
    } catch (IOException exc) {
        fail(exc.getMessage());
    }
}