Example usage for org.apache.hadoop.io SequenceFile createWriter

Introduction

This page collects example usages of org.apache.hadoop.io.SequenceFile#createWriter from open-source projects.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
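
For reference, below is a minimal, self-contained sketch of calling this overload directly. The output path, key/value classes, and codec are illustrative assumptions, not values taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CreateWriterSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // With this overload the caller opens the output stream itself; the
        // writer does not take ownership of it, so close both when done.
        FSDataOutputStream out = fs.create(new Path("/tmp/example.seq")); // hypothetical path
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, IntWritable.class, Text.class,
                CompressionType.BLOCK, new DefaultCodec());
        try {
            writer.append(new IntWritable(1), new Text("value"));
        } finally {
            writer.close();
            out.close();
        }
    }
}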

Usage

From source file: org.apache.hama.ml.kmeans.KMeansBSP.java

License: Apache License

private void recalculateAssignmentsAndWrite(
        BSPPeer<VectorWritable, NullWritable, IntWritable, VectorWritable, CenterMessage> peer)
        throws IOException {
    final NullWritable value = NullWritable.get();
    // also use our cache to speed up the final writes if exists
    if (cache == null) {
        final VectorWritable key = new VectorWritable();
        IntWritable keyWrite = new IntWritable();
        while (peer.readNext(key, value)) {
            final int lowestDistantCenter = getNearestCenter(key.getVector());
            keyWrite.set(lowestDistantCenter);
            peer.write(keyWrite, key);
        }
    } else {
        IntWritable keyWrite = new IntWritable();
        for (DoubleVector v : cache) {
            final int lowestDistantCenter = getNearestCenter(v);
            keyWrite.set(lowestDistantCenter);
            peer.write(keyWrite, new VectorWritable(v));
        }
    }
    // just on the first task write the centers to filesystem to prevent
    // collisions
    if (peer.getPeerName().equals(peer.getPeerName(0))) {
        String pathString = conf.get(CENTER_OUT_PATH);
        if (pathString != null) {
            final SequenceFile.Writer dataWriter = SequenceFile.createWriter(FileSystem.get(conf), conf,
                    new Path(pathString), VectorWritable.class, NullWritable.class, CompressionType.NONE);
            for (DoubleVector center : centers) {
                dataWriter.append(new VectorWritable(center), value);
            }
            dataWriter.close();
        }
    }
}
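
This example uses the deprecated path-based overload createWriter(FileSystem, Configuration, Path, Class, Class, CompressionType): unlike the stream-based variant in the prototype above, the writer creates the file at the given path itself, so closing the writer is all the cleanup needed.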

From source file: org.apache.hama.ml.kmeans.KMeansBSP.java

License: Apache License

/**
 * Reads input text files and writes them to a sequence file.
 *
 * @param k the number of vectors to use as initial centers.
 * @param conf the job configuration.
 * @param txtIn the text input file or directory.
 * @param center the path the initial centers are written to.
 * @param out the output path; deleted first if it already exists.
 * @param fs the filesystem to use.
 * @param hasKey true if the first column is required to be the key.
 * @return the path of the sequence file.
 * @throws IOException
 */
public static Path prepareInputText(int k, Configuration conf, Path txtIn, Path center, Path out, FileSystem fs,
        boolean hasKey) throws IOException {

    Path in;
    if (fs.isFile(txtIn)) {
        in = new Path(txtIn.getParent(), "textinput/in.seq");
    } else {
        in = new Path(txtIn, "textinput/in.seq");
    }

    if (fs.exists(out))
        fs.delete(out, true);

    if (fs.exists(center))
        fs.delete(center, true);

    if (fs.exists(in))
        fs.delete(in, true);

    final NullWritable value = NullWritable.get();

    Writer centerWriter = new SequenceFile.Writer(fs, conf, center, VectorWritable.class, NullWritable.class);

    final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, in, VectorWritable.class,
            NullWritable.class, CompressionType.NONE);

    int i = 0;

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(txtIn)));
    String line;
    while ((line = br.readLine()) != null) {
        String[] split = line.split("\t");
        int columnLength = split.length;
        int indexPos = 0;
        if (hasKey) {
            columnLength = columnLength - 1;
            indexPos++;
        }

        DenseDoubleVector vec = new DenseDoubleVector(columnLength);
        for (int j = 0; j < columnLength; j++) {
            vec.set(j, Double.parseDouble(split[j + indexPos]));
        }

        VectorWritable vector;
        if (hasKey) {
            NamedDoubleVector named = new NamedDoubleVector(split[0], vec);
            vector = new VectorWritable(named);
        } else {
            vector = new VectorWritable(vec);
        }

        dataWriter.append(vector, value);
        if (k > i) {
            centerWriter.append(vector, value);
        }
        i++;
    }
    br.close();
    centerWriter.close();
    dataWriter.close();
    return in;
}

From source file: org.apache.hama.ml.kmeans.KMeansBSP.java

License: Apache License

/**
 * Create some random vectors as input and assign the first k vectors as
 * initial centers.
 */
public static void prepareInput(int count, int k, int dimension, Configuration conf, Path in, Path center,
        Path out, FileSystem fs) throws IOException {
    if (fs.exists(out))
        fs.delete(out, true);

    if (fs.exists(center))
        fs.delete(center, true);

    if (fs.exists(in))
        fs.delete(in, true);

    final SequenceFile.Writer centerWriter = SequenceFile.createWriter(fs, conf, center, VectorWritable.class,
            NullWritable.class, CompressionType.NONE);
    final NullWritable value = NullWritable.get();

    final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, in, VectorWritable.class,
            NullWritable.class, CompressionType.NONE);

    Random r = new Random();
    for (int i = 0; i < count; i++) {

        double[] arr = new double[dimension];
        for (int d = 0; d < dimension; d++) {
            arr[d] = r.nextInt(count);
        }
        VectorWritable vector = new VectorWritable(new DenseDoubleVector(arr));
        dataWriter.append(vector, value);
        if (k > i) {
            centerWriter.append(vector, value);
        }
    }
    centerWriter.close();
    dataWriter.close();
}

From source file: org.apache.mahout.math.hadoop.stochasticsvd.qr.QRFirstStep.java

License: Apache License

private SequenceFile.Writer getTempQw() throws IOException {
    if (tempQw == null) {
        /*
         * The temporary Q output will hopefully not exceed the size of the IO
         * cache, in which case it is managed by the kernel rather than the Java
         * GC. And if the IO cache is not big enough, then at least access is
         * always sequential.
         */
        String taskTmpDir = System.getProperty("java.io.tmpdir");

        FileSystem localFs = FileSystem.getLocal(jobConf);
        Path parent = new Path(taskTmpDir);
        Path sub = new Path(parent, "qw_" + System.currentTimeMillis());
        tempQPath = new Path(sub, "q-temp.seq");
        tempQw = SequenceFile.createWriter(localFs, jobConf, tempQPath, IntWritable.class,
                DenseBlockWritable.class, CompressionType.BLOCK);
        closeables.addFirst(tempQw);
        closeables.addFirst(new IOUtils.DeleteFileOnClose(new File(tempQPath.toString())));
    }
    return tempQw;
}
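
Note the choice of CompressionType.BLOCK here: block compression batches many key/value pairs into each compressed block, which usually yields a better compression ratio than RECORD compression for a large, sequentially written temporary file like this one.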

From source file: org.apache.mrql.Bag.java

License: Apache License

/** add a new value to a Bag (cache it in memory if necessary)
 * @param x the new value
 */
public void add(final MRData x) {
    materialize();
    if (!spilled() && Config.hadoop_mode && size() >= Config.max_materialized_bag)
        spill();
    if (spilled())
        try {
            if (writer == null) { // writer was closed earlier for reading
                FileSystem fs = FileSystem.getLocal(Plan.conf);
                writer = SequenceFile.createWriter(fs, Plan.conf, new Path(path), MRContainer.class,
                        NullWritable.class, SequenceFile.CompressionType.NONE);
                System.err.println("*** Appending elements to a spilled Bag: " + path);
            }
            writer.append(new MRContainer(x), NullWritable.get());
        } catch (IOException e) {
            throw new Error("Cannot append an element to a spilled Bag: " + path);
        }
    else
        content.add(x);
}

From source file: org.apache.mrql.Bag.java

License: Apache License

/** spill the Bag to a local file */
private void spill() {
    if (!spilled() && Config.hadoop_mode)
        try {
            if (Plan.conf == null)
                Plan.conf = Evaluator.evaluator.new_configuration();
            final FileSystem fs = FileSystem.getLocal(Plan.conf);
            path = new_path(fs);
            System.err.println("*** Spilling a Bag to a local file: " + path);
            final Path p = new Path(path);
            writer = SequenceFile.createWriter(fs, Plan.conf, new Path(path), MRContainer.class,
                    NullWritable.class, SequenceFile.CompressionType.NONE);
            for (MRData e : this)
                writer.append(new MRContainer(e), NullWritable.get());
            mode = Modes.SPILLED;
            content = null;
            iterator = null;
        } catch (Exception e) {
            throw new Error("Cannot spill a Bag to a local file");
        }
}

From source file: org.apache.mrql.Plan.java

License: Apache License

/** splits the range min..max into multiple ranges, one for each mapper */
public final static DataSet generator(int source_num, long min, long max, long split_length) throws Exception {
    if (min > max)
        throw new Error("Wrong range: " + min + "..." + max);
    if (split_length < 1)
        if (Config.bsp_mode)
            split_length = (max - min) / Config.nodes + 1;
        else
            split_length = Config.range_split_size;
    DataSet ds = new DataSet(0, 0);
    long i = min;
    while (i + split_length <= max) {
        String file = new_path(conf);
        Path path = new Path(file);
        SequenceFile.Writer writer = SequenceFile.createWriter(path.getFileSystem(conf), conf, path,
                MRContainer.class, MRContainer.class, SequenceFile.CompressionType.NONE);
        writer.append(new MRContainer(new MR_long(i)),
                new MRContainer(new Tuple(new MR_long(i), new MR_long(split_length))));
        writer.close();
        ds.source.add(new GeneratorDataSource(source_num, file, conf));
        i += split_length;
    }
    if (i <= max) {
        String file = new_path(conf);
        Path path = new Path(file);
        SequenceFile.Writer writer = SequenceFile.createWriter(path.getFileSystem(conf), conf, path,
                MRContainer.class, MRContainer.class, SequenceFile.CompressionType.NONE);
        writer.append(new MRContainer(new MR_long(i)),
                new MRContainer(new Tuple(new MR_long(i), new MR_long(max - i + 1))));
        writer.close();
        ds.source.add(new GeneratorDataSource(source_num, file, conf));
    }
    return ds;
}

From source file: org.apache.tez.dag.history.logging.proto.ProtoMessageWriter.java

License: Apache License

ProtoMessageWriter(Configuration conf, Path filePath, Parser<T> parser) throws IOException {
    this.filePath = filePath;
    this.writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(filePath),
            SequenceFile.Writer.keyClass(NullWritable.class),
            SequenceFile.Writer.valueClass(ProtoMessageWritable.class),
            SequenceFile.Writer.appendIfExists(true), SequenceFile.Writer.compression(CompressionType.RECORD));
    this.writable = new ProtoMessageWritable<>(parser);
}
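
Unlike the earlier examples, this one uses the current, non-deprecated options-based overload createWriter(Configuration, SequenceFile.Writer.Option...). The appendIfExists(true) option reopens an existing file and appends to it rather than overwriting it.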

From source file: org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java

License: Open Source License

public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath,
        Class<KeyType> keyClass, Class<ValueType> valueClass,
        SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, boolean compress) throws IOException {

    _indexWriter = optionalIndexWriter;
    _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE);
    _outputStream = fileSystem.create(outputFilePath);

    // allocate buffer ...
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);

    if (compress) {
        Class codecClass = conf.getClass("mapred.output.compression.codec", DefaultCodec.class);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);

        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
                codec);
    } else {
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE,
                null);
    }

    _writerThread = new Thread(new Runnable() {

        @Override
        public void run() {
            // LOG.info("Writer Thread Starting");

            while (true) {

                QueuedBufferItem queuedBufferItem = null;

                try {
                    queuedBufferItem = _bufferQueue.take();
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                if (queuedBufferItem._buffer == null) {
                    // LOG.info("Writer Thread received empty buffer item. Exiting");
                    return;
                } else {

                    ByteBuffer theBuffer = queuedBufferItem._buffer;

                    // LOG.info("Writer Thread received item. Limit:" +
                    // theBuffer.limit());

                    // get byte pointer
                    byte[] bufferAsBytes = theBuffer.array();

                    int itemsWritten = 0;
                    long timeStart = System.currentTimeMillis();

                    while (theBuffer.remaining() != 0) {

                        // now read in key length
                        int keyLen = theBuffer.getInt();
                        // mark key position
                        int keyPos = theBuffer.position();
                        // now skip past key length
                        theBuffer.position(keyPos + keyLen);
                        // read value length
                        int valueLen = theBuffer.getInt();
                        // mark value position
                        int valuePosition = theBuffer.position();
                        // now skip past it ...
                        theBuffer.position(valuePosition + valueLen);
                        // now write this out to the sequence file ...

                        try {
                            spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition,
                                    valueLen);
                        } catch (IOException e) {
                            LOG.error("Writer Thread Failed with Error:" + StringUtils.stringifyException(e));
                            _writerException = e;
                            return;
                        }
                        itemsWritten++;
                    }
                    // LOG.info("Writer Thread Finished With Buffer. Wrote:"+
                    // itemsWritten + " in:" + (System.currentTimeMillis() -
                    // timeStart));
                }
            }
        }

    });
    _writerThread.start();
}

From source file: org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java

License: Open Source License

@Override
public void configure(JobConf job) {

    HashSet<Integer> onlyDoPartitions = null;
    String hack = job.get("hack");
    if (hack != null) {
        onlyDoPartitions = new HashSet<Integer>();
        JsonParser parser = new JsonParser();
        JsonArray hackArray = parser.parse(hack).getAsJsonArray();
        for (JsonElement element : hackArray) {
            onlyDoPartitions.add(element.getAsInt());
        }
    }
    _conf = job;
    try {
        _fs = FileSystem.get(_conf);
        int partitionId = _conf.getInt("mapred.task.partition", 0);
        if (onlyDoPartitions == null || onlyDoPartitions.contains(partitionId)) {
            Path redirectPath = new Path(FileOutputFormat.getWorkOutputPath(_conf),
                    "redirect-" + NUMBER_FORMAT.format(partitionId));
            _redirectWriter = SequenceFile.createWriter(_fs, _conf, redirectPath, TextBytes.class,
                    TextBytes.class, CompressionType.BLOCK);
        } else {
            _skipPartition = true;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}