Usage examples for the org.apache.mahout.utils.io.ChunkedWriter constructor:
public ChunkedWriter(Configuration conf, int chunkSizeInMB, Path output) throws IOException
From source file: com.netease.news.text.SequenceFilesFromDirectory.java
License: Apache License
private int runSequential(Configuration conf, Path input, Path output, Map<String, String> options) throws IOException, InterruptedException, NoSuchMethodException { // Running sequentially Charset charset = Charset.forName(getOption(CHARSET_OPTION[0])); String keyPrefix = getOption(KEY_PREFIX_OPTION[0]); FileSystem fs = FileSystem.get(input.toUri(), conf); ChunkedWriter writer = new ChunkedWriter(conf, Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output); try {/*from w w w . j a v a 2s .co m*/ SequenceFilesFromDirectoryFilter pathFilter; String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]); if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) { pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options, writer, charset, fs); } else { pathFilter = ClassUtils.instantiateAs(fileFilterClassName, SequenceFilesFromDirectoryFilter.class, new Class[] { Configuration.class, String.class, Map.class, ChunkedWriter.class, Charset.class, FileSystem.class }, new Object[] { conf, keyPrefix, options, writer, charset, fs }); } fs.listStatus(input, pathFilter); } finally { Closeables.close(writer, false); } return 0; }
From source file: edu.indiana.d2i.htrc.io.SequentialDataCopyJob.java
License: Apache License
/**
 * Sequentially copies volume text fetched from the HTRC Data API into chunked
 * SequenceFiles. Reads one volume ID per line from {@code args[0]}, batches IDs
 * (pipe-separated) into requests of up to 100, and streams each response into a
 * {@link ChunkedWriter} rolling at {@code args[2]} MB per chunk.
 *
 * args: [0] input ID-list path, [1] output path, [2] chunk size in MB,
 *       [3] Data API configuration class name, [4] max IDs per request.
 *
 * @return 0 on success, -1 on bad usage
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        // Fix: the original fell through after printUsage() and indexed args[0..4],
        // raising ArrayIndexOutOfBoundsException instead of reporting bad usage.
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int chunkSizeInMB = Integer.parseInt(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.parseInt(args[4]);

    logger.info("SequentialDataCopyJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - chunkSizeInMB: " + chunkSizeInMB);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Configuration conf = getConf();
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);
    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    ChunkedWriter chunkWriter = new ChunkedWriter(getConf(), chunkSizeInMB, new Path(outputPath));
    try {
        Path input = new Path(inputPath);
        FileSystem fs = input.getFileSystem(conf);
        // Fix: removed the redundant DataInputStream wrapper; fs.open() already
        // returns an InputStream. NOTE(review): reader uses the platform default
        // charset, as the original did — confirm whether UTF-8 should be forced.
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(input)));
        try {
            // Batch up to idNumThreshold pipe-separated IDs per Data API request.
            final int idNumThreshold = 100;
            int idNum = 0;
            StringBuilder idList = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                idList.append(line).append('|');
                if (++idNum >= idNumThreshold) {
                    text2Seq(client.getID2Content(idList.toString()), chunkWriter);
                    idList.setLength(0); // reuse the builder instead of reallocating
                    idNum = 0;
                }
            }
            // Flush any trailing partial batch.
            if (idList.length() > 0) {
                text2Seq(client.getID2Content(idList.toString()), chunkWriter);
            }
        } finally {
            // Fix: the original closed reader/chunkWriter only on the success path,
            // leaking both when an exception was thrown mid-copy.
            reader.close();
        }
    } finally {
        chunkWriter.close();
    }
    return 0;
}