Example usage for org.apache.hadoop.io SequenceFile createWriter

Introduction

On this page you can find example usage for org.apache.hadoop.io SequenceFile createWriter.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec, Metadata metadata) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
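The examples below all use the related FileSystem-based overloads. As a minimal sketch of calling the prototype above directly, assuming an illustrative output path "example.seq" and Text keys and values (these names are not taken from the examples below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class RawWriterSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // illustrative output path, not taken from the examples below
        FSDataOutputStream out = fs.create(new Path("example.seq"));

        DefaultCodec codec = new DefaultCodec();
        codec.setConf(conf); // codecs must be configured before use

        SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, Text.class, Text.class,
                SequenceFile.CompressionType.RECORD, codec, new SequenceFile.Metadata());
        try {
            writer.append(new Text("key"), new Text("value"));
        } finally {
            writer.close();
            out.close(); // this overload writes to a caller-supplied stream, so close it separately
        }
    }
}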

Usage

From source file: org.apache.sqoop.connector.hdfs.TestHdfsBase.java

License: Apache License
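
This test helper writes a set of SequenceFiles filled with generated CSV-style rows, block-compressed with the supplied codec when one is given.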

protected void createSequenceInput(String indir, Class<? extends CompressionCodec> clz, int numberOfFiles,
        int numberOfRows) throws IOException, InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();

    CompressionCodec codec = null;
    if (clz != null) {
        codec = clz.newInstance();
        if (codec instanceof Configurable) {
            ((Configurable) codec).setConf(conf);
        }
    }

    int index = 1;
    for (int fi = 0; fi < numberOfFiles; fi++) {
        Path filepath = new Path(indir, UUID.randomUUID() + ".seq");
        SequenceFile.Writer filewriter;
        if (codec != null) {
            filewriter = SequenceFile.createWriter(filepath.getFileSystem(conf), conf, filepath, Text.class,
                    NullWritable.class, SequenceFile.CompressionType.BLOCK, codec);
        } else {
            filewriter = SequenceFile.createWriter(filepath.getFileSystem(conf), conf, filepath, Text.class,
                    NullWritable.class, SequenceFile.CompressionType.NONE);
        }

        Text text = new Text();
        for (int ri = 0; ri < numberOfRows; ri++) {
            String row = index + "," + (double) index + ",'" + index + "'";
            text.set(row);
            filewriter.append(text, NullWritable.get());
            index++;
        }

        filewriter.close();
    }
}

From source file: org.apache.tajo.storage.sequencefile.SequenceFileAppender.java

License: Apache License
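
This appender initializer chooses key/value classes based on the configured serializer and opens the writer with the compression type requested in the table metadata; fields such as os, fs, path, meta, schema and enabledStats belong to the enclosing appender class.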

@Override
public void init() throws IOException {
    os = new NonSyncByteArrayOutputStream(BUFFER_SIZE);

    this.fs = path.getFileSystem(conf);

    //determine the intermediate file type
    String store = conf.get(TajoConf.ConfVars.SHUFFLE_FILE_FORMAT.varname,
            TajoConf.ConfVars.SHUFFLE_FILE_FORMAT.defaultVal);
    if (enabledStats
            && CatalogProtos.StoreType.SEQUENCEFILE == CatalogProtos.StoreType.valueOf(store.toUpperCase())) {
        isShuffle = true;
    } else {
        isShuffle = false;
    }

    this.delimiter = StringEscapeUtils.unescapeJava(this.meta.getOption(StorageConstants.SEQUENCEFILE_DELIMITER,
            StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0);
    this.columnNum = schema.size();
    String nullCharacters = StringEscapeUtils
            .unescapeJava(this.meta.getOption(StorageConstants.SEQUENCEFILE_NULL, NullDatum.DEFAULT_TEXT));
    if (StringUtils.isEmpty(nullCharacters)) {
        nullChars = NullDatum.get().asTextBytes();
    } else {
        nullChars = nullCharacters.getBytes();
    }

    if (!fs.exists(path.getParent())) {
        throw new FileNotFoundException(path.toString());
    }

    if (this.meta.containsOption(StorageConstants.COMPRESSION_CODEC)) {
        String codecName = this.meta.getOption(StorageConstants.COMPRESSION_CODEC);
        codecFactory = new CompressionCodecFactory(conf);
        codec = codecFactory.getCodecByClassName(codecName);
    } else {
        if (fs.exists(path)) {
            throw new AlreadyExistsStorageException(path);
        }
    }

    try {
        String serdeClass = this.meta.getOption(StorageConstants.SEQUENCEFILE_SERDE,
                TextSerializerDeserializer.class.getName());
        serde = (SerializerDeserializer) Class.forName(serdeClass).newInstance();
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
        throw new IOException(e);
    }

    Class<? extends Writable> keyClass, valueClass;
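    // the binary serde stores both key and value as BytesWritable; the text serde uses LongWritable keys and Text values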
    if (serde instanceof BinarySerializerDeserializer) {
        keyClass = BytesWritable.class;
        EMPTY_KEY = new BytesWritable();
        valueClass = BytesWritable.class;
    } else {
        keyClass = LongWritable.class;
        EMPTY_KEY = new LongWritable();
        valueClass = Text.class;
    }

    String type = this.meta.getOption(StorageConstants.COMPRESSION_TYPE, CompressionType.NONE.name());
    if (type.equals(CompressionType.BLOCK.name())) {
        writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass, CompressionType.BLOCK, codec);
    } else if (type.equals(CompressionType.RECORD.name())) {
        writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass, CompressionType.RECORD, codec);
    } else {
        writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass, CompressionType.NONE, codec);
    }

    if (enabledStats) {
        this.stats = new TableStatistics(this.schema);
    }

    super.init();
}

From source file: org.commoncrawl.util.CrawlLogSplitter.java

License: Open Source License
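
This tool splits oversized crawl logs into roughly IDEAL_SIZE chunks by copying records in raw form; candidateList, SPLIT_SIZE, IDEAL_SIZE and LOG are static fields of the enclosing class and are not shown here.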

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] arcFiles = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
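                // copy each record in raw (serialized) form; once the output passes IDEAL_SIZE, close it and roll to a new split file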
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file: org.springframework.data.hadoop.store.output.AbstractSequenceFileWriter.java

License: Apache License
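
This method opens a Text/Text writer, record-compressed when a codec is configured, and wraps it in a holder together with the resolved path; getConfiguration(), getResolvedPath() and getCodec() are provided by the enclosing writer class.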

/**
 * Gets the output.
 *
 * @return the output
 * @throws IOException Signals that an I/O exception has occurred.
 */
@SuppressWarnings("deprecation")
protected SequenceFileWriterHolder<Writer> getOutput() throws IOException {
    FileSystem fs = FileSystem.get(getConfiguration());

    SequenceFileWriterHolder<Writer> holder;
    Writer writer;
    CodecInfo codecInfo = getCodec();
    Path p = getResolvedPath();
    if (codecInfo == null) {
        writer = SequenceFile.createWriter(fs, getConfiguration(), getResolvedPath(), Text.class, Text.class,
                CompressionType.NONE, (CompressionCodec) null);
        holder = new SequenceFileWriterHolder<SequenceFile.Writer>(writer, p);
    } else {
        Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(), getClass().getClassLoader());
        CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz,
                getConfiguration());
        writer = SequenceFile.createWriter(fs, getConfiguration(), getResolvedPath(), Text.class, Text.class,
                CompressionType.RECORD, compressionCodec);
        holder = new SequenceFileWriterHolder<SequenceFile.Writer>(writer, p);
    }

    return holder;
}
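
The FileSystem-based overloads used above are deprecated in Hadoop 2.x and later in favor of the option-based createWriter(Configuration, Writer.Option...). A minimal sketch of the replacement, again with an illustrative output path:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class OptionWriterSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // illustrative output path; key/value classes mirror the Sqoop example above
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("example.seq")),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class),
                SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
        try {
            writer.append(new Text("row"), NullWritable.get());
        } finally {
            writer.close();
        }
    }
}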