List of usage examples for org.apache.hadoop.io SequenceFile createWriter
@Deprecated public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass, CompressionType compressionType, CompressionCodec codec) throws IOException
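This overload writes to an already-open FSDataOutputStream rather than to a Path. Below is a minimal sketch of a call to it; the path, the Text/IntWritable key/value types, and the use of DefaultCodec are illustrative assumptions, not taken from the examples that follow. It assumes the usual imports (org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.*, org.apache.hadoop.io.*, org.apache.hadoop.io.compress.DefaultCodec).

// Minimal sketch of the stream-based overload (assumed path and key/value types).
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
FSDataOutputStream out = fs.create(new Path("/tmp/example.seq")); // hypothetical path
SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, Text.class, IntWritable.class,
        SequenceFile.CompressionType.RECORD, new DefaultCodec());
try {
    writer.append(new Text("key"), new IntWritable(1));
} finally {
    // Depending on the Hadoop version, the caller may also need to close the
    // stream it opened; the writer only manages what it created itself.
    writer.close();
}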
From source file:edu.ucsb.cs.partitioning.cosine.Organizer.java
License:Apache License
public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));
    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(new JobConf());
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);

    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;

        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
}
From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java
License:Apache License
public static SequenceFile.Writer openFile(FileSystem hdfs, JobConf job, Path parent, int child)
        throws IOException {
    Path outputPath = new Path(parent + "/" + child);
    SequenceFile.Writer out = SequenceFile.createWriter(hdfs, job, outputPath, LongWritable.class,
            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
    return out;
}
From source file:edu.ucsb.cs.partitioning.jaccard.JaccardCoarsePartitionMain.java
License:Apache License
public static SequenceFile.Writer openFile(FileSystem hdfs, Path parent, int child) throws IOException {
    return SequenceFile.createWriter(hdfs, new Configuration(), new Path(parent + "/" + child),
            LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
}
From source file:edu.ucsb.cs.preprocessing.sequence.SeqFilesCombiner.java
License:Apache License
public static void readDirWriteFile(String inputDir, String outputFile, int nLines) throws IOException {
    Configuration conf = new Configuration();
    Path inputPath = new Path(inputDir);
    FileSystem fs = inputPath.getFileSystem(conf);
    int lineCount = 0;
    FileStatus[] files = fs.listStatus(inputPath);

    if (fs.exists(new Path(outputFile)))
        fs.delete(new Path(outputFile));

    SequenceFile.Writer outFile = SequenceFile.createWriter(fs, conf, new Path(outputFile), LongWritable.class,
            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);

    for (int i = 0; i < files.length; i++) {
        if (files[i].isDir() || files[i].getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, files[i].getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value) && lineCount < nLines) {
            lineCount++;
            outFile.append(key, value);
        }
        reader.close();
        if (lineCount >= nLines)
            break;
    }
    outFile.close();
    System.out.println("Total lines copied: " + lineCount);
}
From source file:eu.scape_project.tb.lsdr.seqfileutility.SequenceFileWriter.java
License:Apache License
@Override
public void run() {
    try {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        path = new Path(uri);
        Class keyClass = Text.class;
        Class valueClass = BytesWritable.class;
        if (pc.isTextlinemode()) {
            keyClass = Text.class;
            valueClass = Text.class;
        }
        writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass,
                CompressionType.get(pc.getCompressionType()));
        traverseDir(rootDir);
    } catch (Exception e) {
        logger.error(this.getId() + ": " + "IOException occurred", e);
    } finally {
        IOUtils.closeStream(writer);
    }
}
From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java
License:LGPL
/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 */
private static void setup(final Configuration conf, final JobConf jobConf, final Arguments args)
        throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname,
            args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());
    long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (null == parent) {
                // If dst is '/' on S3, it might not exist yet, but dst.getParent()
                // will return null. In this case, use '/' as its own parent to prevent
                // NPE errors below.
                parent = args.dst;
            }
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist,
            LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist,
            Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist,
            Text.class, FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        // skip file if the src and the dst files are the same.
                        skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
                        // skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit
                                || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            // if (LOG.isTraceEnabled()) {
                            // LOG.trace("adding file " + child.getPath());
                            // }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        getLogger().info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    // Explicitly create the tmpDir to ensure that it can be cleaned
    // up by fullyDelete() later.
    tmpDir.getFileSystem(conf).mkdirs(tmpDir);

    getLogger().info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
}
From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java
License:LGPL
/** Delete the dst files/dirs which do not exist in src */
static private void deleteNonexisting(final FileSystem dstfs, final FileStatus dstroot, final Path dstsorted,
        final FileSystem jobfs, final Path jobdir, final JobConf jobconf, final Configuration conf)
        throws IOException {
    if (!dstroot.isDir()) {
        throw new IOException("dst must be a directory when option " + Options.DELETE.cmd
                + " is set, but dst (= " + dstroot.getPath() + ") is not a directory.");
    }

    // write dst lsr results
    final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
    final SequenceFile.Writer writer = SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class,
            dstroot.getClass(), SequenceFile.CompressionType.NONE);
    try {
        // do lsr to get all file statuses in dstroot
        final Stack<FileStatus> lsrstack = new Stack<>();
        for (lsrstack.push(dstroot); !lsrstack.isEmpty();) {
            final FileStatus status = lsrstack.pop();
            if (status.isDir()) {
                for (FileStatus child : dstfs.listStatus(status.getPath())) {
                    String relative = makeRelative(dstroot.getPath(), child.getPath());
                    writer.append(new Text(relative), child);
                    lsrstack.push(child);
                }
            }
        }
    } finally {
        checkAndClose(writer);
    }

    // sort lsr results
    final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
    SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class,
            FileStatus.class, jobconf);
    sorter.sort(dstlsr, sortedlsr);

    // compare lsr list and dst list
    SequenceFile.Reader lsrin = null;
    SequenceFile.Reader dstin = null;
    try {
        lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
        dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

        // compare sorted lsr list and sorted dst list
        final Text lsrpath = new Text();
        final FileStatus lsrstatus = new FileStatus();
        final Text dstpath = new Text();
        final Text dstfrom = new Text();
        final FsShell shell = new FsShell(conf);
        final String[] shellargs = { "-rmr", null };

        boolean hasnext = dstin.next(dstpath, dstfrom);
        for (; lsrin.next(lsrpath, lsrstatus);) {
            int dst_cmp_lsr = dstpath.compareTo(lsrpath);
            for (; hasnext && dst_cmp_lsr < 0;) {
                hasnext = dstin.next(dstpath, dstfrom);
                dst_cmp_lsr = dstpath.compareTo(lsrpath);
            }

            if (dst_cmp_lsr == 0) {
                // lsrpath exists in dst, skip it
                hasnext = dstin.next(dstpath, dstfrom);
            } else {
                // lsrpath does not exist, delete it
                String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
                if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
                    shellargs[1] = s;
                    int r = 0;
                    try {
                        r = shell.run(shellargs);
                    } catch (Exception e) {
                        throw new IOException("Exception from shell.", e);
                    }
                    if (r != 0) {
                        throw new IOException("\"" + shellargs[0] + " " + shellargs[1]
                                + "\" returns non-zero value " + r);
                    }
                }
            }
        }
    } finally {
        checkAndClose(lsrin);
        checkAndClose(dstin);
    }
}
From source file:functionaltests.ext.mapreduce.TestMapReduce3Jobs.java
License:Apache License
@org.junit.Test
public void run() throws Exception {
    Path TEST_ROOT_DIR = new Path(
            System.getProperty("java.io.tmpdir") + File.separator + "TestMapReduce3Jobs");

    fs.delete(TEST_ROOT_DIR, true);

    //
    // Generate distribution of ints. This is the answer key.
    //
    Configuration conf = new Configuration();
    int countsToGo = counts;
    int dist[] = new int[range];
    for (int i = 0; i < range; i++) {
        double avgInts = (1.0 * countsToGo) / (range - i);
        dist[i] = (int) Math.max(0, Math.round(avgInts + (Math.sqrt(avgInts) * r.nextGaussian())));
        countsToGo -= dist[i];
    }
    if (countsToGo > 0) {
        dist[dist.length - 1] += countsToGo;
    }

    //
    // Write the answer key to a file.
    //
    if (!fs.mkdirs(TEST_ROOT_DIR)) {
        throw new IOException("Mkdirs failed to create " + TEST_ROOT_DIR.toString());
    }
    Path randomInsRel = new Path("genins");
    Path randomIns = new Path(TEST_ROOT_DIR, randomInsRel);
    if (!fs.mkdirs(randomIns)) {
        throw new IOException("Mkdirs failed to create " + randomIns.toString());
    }

    Path answerkeyRel = new Path("answer.key");
    Path answerkey = new Path(randomIns, answerkeyRel);
    SequenceFile.Writer out = SequenceFile.createWriter(fs, conf, answerkey, IntWritable.class,
            IntWritable.class, SequenceFile.CompressionType.NONE);
    try {
        for (int i = 0; i < range; i++) {
            out.append(new IntWritable(i), new IntWritable(dist[i]));
        }
    } finally {
        out.close();
    }

    printFiles(randomIns, conf);

    //
    // Now we need to generate the random numbers according to
    // the above distribution.
    //
    // We create a lot of map tasks, each of which takes at least
    // one "line" of the distribution. (That is, a certain number
    // X is to be generated Y number of times.)
    //
    // A map task emits Y key/val pairs. The val is X. The key
    // is a randomly-generated number.
    //
    // The reduce task gets its input sorted by key. That is, sorted
    // in random order. It then emits a single line of text that
    // for the given values. It does not emit the key.
    //
    // Because there's just one reduce task, we emit a single big
    // file of random numbers.
    //
    Path randomOutsRel = new Path("genouts");
    Path randomOuts = new Path(TEST_ROOT_DIR, randomOutsRel);
    fs.delete(randomOuts, true);
    fs.mkdirs(randomOuts);

    Job genJob = new Job(conf, "gen job");
    // FileInputFormat.setInputPaths(genJob, randomIns);
    genJob.setInputFormatClass(SequenceFileInputFormat.class);
    genJob.setMapperClass(RandomGenMapper.class);
    // genJob.setMapperClass(TokenizerMapper.class);
    FileInputFormat.addInputPath(genJob, answerkeyRel);
    FileOutputFormat.setOutputPath(genJob, randomOutsRel);
    // FileOutputFormat.setOutputPath(genJob, randomOuts);
    genJob.setOutputKeyClass(IntWritable.class);
    genJob.setOutputValueClass(IntWritable.class);
    // genJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    genJob.setReducerClass(RandomGenReducer.class);
    genJob.setNumReduceTasks(1);

    PAMapReduceJobConfiguration pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + randomIns);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);
    MapReduceTHelper.submit(genJob, pamrjc);

    printFiles(randomOuts, conf);

    //
    // Next, we read the big file in and regenerate the
    // original map. It's split into a number of parts.
    // (That number is 'intermediateReduces'.)
    //
    // We have many map tasks, each of which read at least one
    // of the output numbers. For each number read in, the
    // map task emits a key/value pair where the key is the
    // number and the value is "1".
    //
    // We have a single reduce task, which receives its input
    // sorted by the key emitted above. For each key, there will
    // be a certain number of "1" values. The reduce task sums
    // these values to compute how many times the given key was
    // emitted.
    //
    // The reduce task then emits a key/val pair where the key
    // is the number in question, and the value is the number of
    // times the key was emitted. This is the same format as the
    // original answer key (except that numbers emitted zero times
    // will not appear in the regenerated key.) The answer set
    // is split into a number of pieces. A final MapReduce job
    // will merge them.
    //
    // There's not really a need to go to 10 reduces here
    // instead of 1. But we want to test what happens when
    // you have multiple reduces at once.
    //
    int intermediateReduces = 10;
    Path intermediateOutsRel = new Path("intermediateouts");
    Path intermediateOuts = new Path(TEST_ROOT_DIR, intermediateOutsRel);
    fs.delete(intermediateOuts, true);
    conf = new Configuration();
    Job checkJob = new Job(conf, "check job");
    // FileInputFormat.setInputPaths(checkJob, randomOuts);
    FileInputFormat.setInputPaths(checkJob, randomOutsRel);
    checkJob.setMapperClass(RandomCheckMapper.class);
    // checkJob.setInputFormatClass(TextInputFormat.class);
    FileOutputFormat.setOutputPath(checkJob, intermediateOutsRel);
    checkJob.setOutputKeyClass(IntWritable.class);
    checkJob.setOutputValueClass(IntWritable.class);
    checkJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    checkJob.setReducerClass(RandomCheckReducer.class);
    checkJob.setNumReduceTasks(intermediateReduces);

    pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + TEST_ROOT_DIR);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);
    MapReduceTHelper.submit(checkJob, pamrjc);

    printFiles(intermediateOuts, conf);

    //
    // OK, now we take the output from the last job and
    // merge it down to a single file. The map() and reduce()
    // functions don't really do anything except reemit tuples.
    // But by having a single reduce task here, we end up merging
    // all the files.
    //
    Path finalOutsRel = new Path("finalouts");
    Path finalOuts = new Path(TEST_ROOT_DIR, finalOutsRel);
    fs.delete(finalOuts, true);
    Job mergeJob = new Job(conf, "merge job");
    FileInputFormat.setInputPaths(mergeJob, intermediateOutsRel);
    mergeJob.setInputFormatClass(SequenceFileInputFormat.class);
    mergeJob.setMapperClass(MergeMapper.class);
    FileOutputFormat.setOutputPath(mergeJob, finalOutsRel);
    mergeJob.setOutputKeyClass(IntWritable.class);
    mergeJob.setOutputValueClass(IntWritable.class);
    mergeJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    mergeJob.setReducerClass(MergeReducer.class);
    mergeJob.setNumReduceTasks(1);

    pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + TEST_ROOT_DIR);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);
    MapReduceTHelper.submit(mergeJob, pamrjc);

    printFiles(finalOuts, conf);

    //
    // Finally, we compare the reconstructed answer key with the
    // original one. Remember, we need to ignore zero-count items
    // in the original key.
    //
    boolean success = true;
    try {
        File dir = new File(finalOuts.toString());
        System.out.println(finalOuts.toString());
        System.out.println(dir);
        String filename = dir.list()[0];
        Path recomputedkey = new Path(finalOuts, filename);
        System.out.println("++++++++++++++++ Path to recomputed key: " + recomputedkey);
        SequenceFile.Reader in = new SequenceFile.Reader(fs, recomputedkey, conf);
        int totalseen = 0;
        try {
            IntWritable key = new IntWritable();
            IntWritable val = new IntWritable();
            for (int i = 0; i < range; i++) {
                if (dist[i] == 0) {
                    continue;
                }
                if (!in.next(key, val)) {
                    System.err.println("Cannot read entry " + i);
                    success = false;
                    break;
                } else {
                    if (!((key.get() == i) && (val.get() == dist[i]))) {
                        System.err.println("Mismatch! Pos=" + key.get() + ", i=" + i + ", val=" + val.get()
                                + ", dist[i]=" + dist[i]);
                        success = false;
                    }
                    totalseen += val.get();
                }
            }
            if (success) {
                if (in.next(key, val)) {
                    System.err.println("Unnecessary lines in recomputed key!");
                    success = false;
                }
            }
        } finally {
            in.close();
        }
        int originalTotal = 0;
        for (int i = 0; i < dist.length; i++) {
            originalTotal += dist[i];
        }
        System.out.println("Original sum: " + originalTotal);
        System.out.println("Recomputed sum: " + totalseen);

        //
        // Write to "results" whether the test succeeded or not.
        //
        Path resultFile = new Path(TEST_ROOT_DIR, "results");
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(resultFile)));
        try {
            bw.write("Success=" + success + "\n");
            System.out.println("Success=" + success);
        } finally {
            bw.close();
        }
        Assert.assertTrue("Test failed", success);
        fs.delete(TEST_ROOT_DIR, true);
    } catch (Throwable e) {
        Assert.assertTrue("Unexpected exception; test failed", false);
        e.printStackTrace();
    }
}
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
public static void matrix2Vector(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = null;
    // Open the SequenceFile and instantiate its key/value classes reflectively.
    reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class,
                CompressionType.BLOCK);
        final IntWritable key1 = new IntWritable();
        final VectorWritable value = new VectorWritable();
        int lineNum = 0;
        Vector vector = null;
        while (reader.next(key, val)) {
            int index = 0;
            StringTokenizer st = new StringTokenizer(val.toString());
            // Wrap a SequentialAccessSparseVector in a NamedVector keyed by the line number.
            vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
            while (st.hasMoreTokens()) {
                if (Integer.parseInt(st.nextToken()) == 1) {
                    vector.set(index, 1);
                }
                index++;
            }
            key1.set(lineNum++);
            value.set(vector);
            // Append the IntWritable row index (key1), matching the writer's declared key class.
            writer.append(key1, value);
        }
    } finally {
        writer.close();
        reader.close();
    }
}
From source file:io.covert.binary.analysis.BuildSequenceFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, name, Text.class, BytesWritable.class,
            CompressionType.RECORD);

    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        FileInputStream fileIn = new FileInputStream(file);
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream((int) file.length());
        int b;
        while (-1 != (b = fileIn.read())) {
            bytesOut.write(b);
        }
        fileIn.close();
        bytesOut.close();
        byte[] bytes = bytesOut.toByteArray();

        val.set(bytes, 0, bytes.length);
        key.set(file.getName());
        writer.append(key, val);
    }
    writer.close();
    return 0;
}
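The examples above all call FileSystem-based createWriter overloads that are deprecated in newer Hadoop releases. For comparison, here is a rough sketch of the equivalent call using the Writer.Option-based createWriter; the path and the LongWritable/Text key/value classes are placeholders, not taken from any example above.

// Sketch of the Option-based replacement (assumed path and key/value types).
Configuration conf = new Configuration();
SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(new Path("/tmp/example.seq")),
        SequenceFile.Writer.keyClass(LongWritable.class),
        SequenceFile.Writer.valueClass(Text.class),
        SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
try {
    writer.append(new LongWritable(1L), new Text("value"));
} finally {
    writer.close();
}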