Example usage for org.apache.mahout.utils.io ChunkedWriter ChunkedWriter

List of usage examples for org.apache.mahout.utils.io ChunkedWriter ChunkedWriter

Introduction

On this page you can find example usage of the org.apache.mahout.utils.io ChunkedWriter constructor.

Prototype

public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException 

Source Link

Usage

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

/**
 * Sequentially walks {@code input} and writes the files it finds into chunked
 * SequenceFiles under {@code output}.
 *
 * <p>The actual writing happens as a side effect of {@link FileSystem#listStatus}:
 * the path filter passed to it feeds every accepted file into the shared
 * {@link ChunkedWriter}.
 *
 * @param conf    Hadoop configuration used for filesystem access and the writer
 * @param input   directory to scan
 * @param output  destination for the chunked SequenceFiles
 * @param options parsed command-line options (chunk size, filter class, ...)
 * @return 0 on success
 * @throws IOException if filesystem access or writing fails
 */
private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options)
        throws IOException, InterruptedException, NoSuchMethodException {
    // Resolve charset, key prefix and filesystem once up front.
    Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
    String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);

    try {
        String filterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
        SequenceFilesFromDirectoryFilter pathFilter;
        if (!PrefixAdditionFilter.class.getName().equals(filterClassName)) {
            // User-supplied filter implementation: instantiate it reflectively
            // with the same six-argument constructor shape.
            pathFilter = ClassUtils.instantiateAs(filterClassName, SequenceFilesFromDirectoryFilter.class,
                    new Class[] { Configuration.class, String.class, Map.class, ChunkedWriter.class,
                            Charset.class, FileSystem.class },
                    new Object[] { conf, keyPrefix, options, writer, charset, fs });
        } else {
            pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs);
        }
        // Listing with the filter drives the conversion; the returned statuses
        // are intentionally ignored.
        fs.listStatus(input, pathFilter);
    } finally {
        // swallowIOException=false: a failure while closing the writer propagates.
        Closeables.close(writer, false);
    }
    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SequentialDataCopyJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Usage: <input> <output> <chunkSizeInMB> <dataAPIConfClassName> <maxIdsPerReq>
    if (args.length != 5) {
        printUsage();
        // Bail out: the original fell through and would have thrown
        // ArrayIndexOutOfBoundsException on args[0] below.
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int chunkSizeInMB = Integer.parseInt(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.parseInt(args[4]);

    logger.info("SequentialDataCopyJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - chunkSizeInMB: " + chunkSizeInMB);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Configuration conf = getConf();
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    Path input = new Path(inputPath);
    FileSystem fs = input.getFileSystem(conf);

    // try-with-resources guarantees both the writer and the reader are closed
    // even when a fetch or write throws (the original leaked both on error).
    // NOTE(review): the reader uses the platform default charset, as before —
    // confirm whether the id-list file should be read as UTF-8 explicitly.
    try (ChunkedWriter chunkWriter = new ChunkedWriter(conf, chunkSizeInMB, new Path(outputPath));
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new DataInputStream(fs.open(input))))) {
        // Batch volume ids into '|'-separated requests of at most idNumThreshold
        // ids each, to limit the size of a single Data API call.
        int idNumThreshold = 100;
        int idNum = 0;
        StringBuilder idList = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            idList.append(line).append('|');
            if (++idNum >= idNumThreshold) {
                text2Seq(client.getID2Content(idList.toString()), chunkWriter);
                idList.setLength(0);
                idNum = 0;
            }
        }
        // Flush the final partial batch, if any.
        if (idList.length() > 0) {
            text2Seq(client.getID2Content(idList.toString()), chunkWriter);
        }
    }

    return 0;
}