Example usage for org.apache.hadoop.io SequenceFile createWriter

Introduction

On this page you can find example usages of org.apache.hadoop.io SequenceFile createWriter, collected from open-source projects.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
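
This overload is deprecated; since Hadoop 2.x the recommended form is the varargs createWriter(Configuration, Writer.Option...) method, where the output file, key class, value class, and compression are passed as options. Below is a minimal sketch of the option-based form, assuming the default file system is reachable and that /tmp/example.seq is a hypothetical, writable path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CreateWriterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.seq"); // hypothetical output path
        SequenceFile.Writer writer = null;
        try {
            // Option-based replacement for the deprecated positional overloads
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(LongWritable.class),
                    SequenceFile.Writer.valueClass(Text.class),
                    SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
            writer.append(new LongWritable(1L), new Text("first record"));
        } finally {
            IOUtils.closeStream(writer); // close quietly, as several of the examples below do
        }
    }
}

The FileSystem-based overloads used in the examples below, such as createWriter(FileSystem, Configuration, Path, Class, Class, CompressionType), are likewise deprecated in recent Hadoop releases but still functional; they map directly onto the same options.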

Usage

From source file:edu.ucsb.cs.partitioning.cosine.Organizer.java

License:Apache License

public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));

    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(new JobConf());
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);

    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;

        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        // 'unused' and 'document' are fields declared elsewhere in the Organizer class (not shown in this excerpt)
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
    if (distout != null)
        distout.close();
}

From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java

License:Apache License

public static SequenceFile.Writer openFile(FileSystem hdfs, JobConf job, Path parent, int child)
        throws IOException {
    Path outputPath = new Path(parent + "/" + child);
    SequenceFile.Writer out = SequenceFile.createWriter(hdfs, job, outputPath, LongWritable.class,
            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
    return out;
}

From source file:edu.ucsb.cs.partitioning.jaccard.JaccardCoarsePartitionMain.java

License:Apache License

public static SequenceFile.Writer openFile(FileSystem hdfs, Path parent, int child) throws IOException {
    return SequenceFile.createWriter(hdfs, new Configuration(), new Path(parent + "/" + child),
            LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
}

From source file:edu.ucsb.cs.preprocessing.sequence.SeqFilesCombiner.java

License:Apache License

public static void readDirWriteFile(String inputDir, String outputFile, int nLines) throws IOException {
    Configuration conf = new Configuration();
    Path inputPath = new Path(inputDir);
    FileSystem fs = inputPath.getFileSystem(conf);
    int lineCount = 0;
    FileStatus[] files = fs.listStatus(inputPath);
    if (fs.exists(new Path(outputFile)))
        fs.delete(new Path(outputFile));

    SequenceFile.Writer outFile = SequenceFile.createWriter(fs, conf, new Path(outputFile), LongWritable.class,
            FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);

    for (int i = 0; i < files.length; i++) {
        if (files[i].isDir() || files[i].getPath().getName().startsWith("_"))
            continue;
        Reader reader = new SequenceFile.Reader(fs, files[i].getPath(), conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value) && lineCount < nLines) {
            lineCount++;
            outFile.append(key, value);
        }
        reader.close();
        if (lineCount >= nLines)
            break;
    }
    outFile.close();
    System.out.println("Total lines copied: " + lineCount);
}

From source file:eu.scape_project.tb.lsdr.seqfileutility.SequenceFileWriter.java

License:Apache License

@Override
public void run() {
    try {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        path = new Path(uri);
        Class keyClass = Text.class;
        Class valueClass = BytesWritable.class;
        if (pc.isTextlinemode()) {
            keyClass = Text.class;
            valueClass = Text.class;
        }
        writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass,
                CompressionType.get(pc.getCompressionType()));
        traverseDir(rootDir);
    } catch (Exception e) {
        logger.error(this.getId() + ": " + "IOException occurred", e);
    } finally {
        IOUtils.closeStream(writer);
    }
}

From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java

License:LGPL

/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 */
private static void setup(final Configuration conf, final JobConf jobConf, final Arguments args)
        throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (null == parent) {
                // If dst is '/' on S3, it might not exist yet, but dst.getParent()
                // will return null. In this case, use '/' as its own parent to
                // prevent
                // NPE errors below.
                parent = args.dst;
            }
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        // skip file if the src and the dst files are the same.
                        skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
                        // skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            // if (LOG.isTraceEnabled()) {
                            // LOG.trace("adding file " + child.getPath());
                            // }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        getLogger().info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    // Explicitly create the tmpDir to ensure that it can be cleaned
    // up by fullyDelete() later.
    tmpDir.getFileSystem(conf).mkdirs(tmpDir);

    getLogger().info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
}

From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java

License:LGPL

/** Delete the dst files/dirs which do not exist in src */
static private void deleteNonexisting(final FileSystem dstfs, final FileStatus dstroot, final Path dstsorted,
        final FileSystem jobfs, final Path jobdir, final JobConf jobconf, final Configuration conf)
        throws IOException {
    if (!dstroot.isDir()) {
        throw new IOException("dst must be a directory when option " + Options.DELETE.cmd
                + " is set, but dst (= " + dstroot.getPath() + ") is not a directory.");
    }

    // write dst lsr results
    final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
    final SequenceFile.Writer writer = SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class,
            dstroot.getClass(), SequenceFile.CompressionType.NONE);
    try {
        // do lsr to get all file statuses in dstroot
        final Stack<FileStatus> lsrstack = new Stack<>();
        for (lsrstack.push(dstroot); !lsrstack.isEmpty();) {
            final FileStatus status = lsrstack.pop();
            if (status.isDir()) {
                for (FileStatus child : dstfs.listStatus(status.getPath())) {
                    String relative = makeRelative(dstroot.getPath(), child.getPath());
                    writer.append(new Text(relative), child);
                    lsrstack.push(child);
                }
            }
        }
    } finally {
        checkAndClose(writer);
    }

    // sort lsr results
    final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
    SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class,
            FileStatus.class, jobconf);
    sorter.sort(dstlsr, sortedlsr);

    // compare lsr list and dst list
    SequenceFile.Reader lsrin = null;
    SequenceFile.Reader dstin = null;
    try {
        lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
        dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

        // compare sorted lsr list and sorted dst list
        final Text lsrpath = new Text();
        final FileStatus lsrstatus = new FileStatus();
        final Text dstpath = new Text();
        final Text dstfrom = new Text();
        final FsShell shell = new FsShell(conf);
        final String[] shellargs = { "-rmr", null };

        boolean hasnext = dstin.next(dstpath, dstfrom);
        for (; lsrin.next(lsrpath, lsrstatus);) {
            int dst_cmp_lsr = dstpath.compareTo(lsrpath);
            for (; hasnext && dst_cmp_lsr < 0;) {
                hasnext = dstin.next(dstpath, dstfrom);
                dst_cmp_lsr = dstpath.compareTo(lsrpath);
            }

            if (dst_cmp_lsr == 0) {
                // lsrpath exists in dst, skip it
                hasnext = dstin.next(dstpath, dstfrom);
            } else {
                // lsrpath does not exist, delete it
                String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
                if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
                    shellargs[1] = s;
                    int r = 0;
                    try {
                        r = shell.run(shellargs);
                    } catch (Exception e) {
                        throw new IOException("Exception from shell.", e);
                    }
                    if (r != 0) {
                        throw new IOException(
                                "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r);
                    }
                }
            }
        }
    } finally {
        checkAndClose(lsrin);
        checkAndClose(dstin);
    }
}

From source file:functionaltests.ext.mapreduce.TestMapReduce3Jobs.java

License:Apache License

@org.junit.Test
public void run() throws Exception {

    Path TEST_ROOT_DIR = new Path(System.getProperty("java.io.tmpdir") + File.separator + "TestMapReduce3Jobs");

    fs.delete(TEST_ROOT_DIR, true);

    //
    // Generate distribution of ints. This is the answer key.
    //
    Configuration conf = new Configuration();
    int countsToGo = counts;
    int dist[] = new int[range];
    for (int i = 0; i < range; i++) {
        double avgInts = (1.0 * countsToGo) / (range - i);
        dist[i] = (int) Math.max(0, Math.round(avgInts + (Math.sqrt(avgInts) * r.nextGaussian())));
        countsToGo -= dist[i];
    }
    if (countsToGo > 0) {
        dist[dist.length - 1] += countsToGo;
    }

    //
    // Write the answer key to a file.
    //
    if (!fs.mkdirs(TEST_ROOT_DIR)) {
        throw new IOException("Mkdirs failed to create " + TEST_ROOT_DIR.toString());
    }

    Path randomInsRel = new Path("genins");
    Path randomIns = new Path(TEST_ROOT_DIR, randomInsRel);
    if (!fs.mkdirs(randomIns)) {
        throw new IOException("Mkdirs failed to create " + randomIns.toString());
    }

    Path answerkeyRel = new Path("answer.key");
    Path answerkey = new Path(randomIns, answerkeyRel);
    SequenceFile.Writer out = SequenceFile.createWriter(fs, conf, answerkey, IntWritable.class,
            IntWritable.class, SequenceFile.CompressionType.NONE);
    try {
        for (int i = 0; i < range; i++) {
            out.append(new IntWritable(i), new IntWritable(dist[i]));
        }
    } finally {
        out.close();
    }

    printFiles(randomIns, conf);

    //
    // Now we need to generate the random numbers according to
    // the above distribution.
    //
    // We create a lot of map tasks, each of which takes at least
    // one "line" of the distribution. (That is, a certain number
    // X is to be generated Y number of times.)
    //
    // A map task emits Y key/val pairs. The val is X. The key
    // is a randomly-generated number.
    //
    // The reduce task gets its input sorted by key. That is, sorted
    // in random order. It then emits a single line of text that
    // for the given values. It does not emit the key.
    //
    // Because there's just one reduce task, we emit a single big
    // file of random numbers.
    //
    Path randomOutsRel = new Path("genouts");
    Path randomOuts = new Path(TEST_ROOT_DIR, randomOutsRel);
    fs.delete(randomOuts, true);
    fs.mkdirs(randomOuts);

    Job genJob = new Job(conf, "gen job");
    // FileInputFormat.setInputPaths(genJob, randomIns);
    genJob.setInputFormatClass(SequenceFileInputFormat.class);
    genJob.setMapperClass(RandomGenMapper.class);
    // genJob.setMapperClass(TokenizerMapper.class);

    FileInputFormat.addInputPath(genJob, answerkeyRel);
    FileOutputFormat.setOutputPath(genJob, randomOutsRel);

    // FileOutputFormat.setOutputPath(genJob, randomOuts);
    genJob.setOutputKeyClass(IntWritable.class);
    genJob.setOutputValueClass(IntWritable.class);
    // genJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    genJob.setReducerClass(RandomGenReducer.class);
    genJob.setNumReduceTasks(1);

    PAMapReduceJobConfiguration pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + randomIns);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);

    MapReduceTHelper.submit(genJob, pamrjc);

    printFiles(randomOuts, conf);

    //
    // Next, we read the big file in and regenerate the
    // original map. It's split into a number of parts.
    // (That number is 'intermediateReduces'.)
    //
    // We have many map tasks, each of which read at least one
    // of the output numbers. For each number read in, the
    // map task emits a key/value pair where the key is the
    // number and the value is "1".
    //
    // We have a single reduce task, which receives its input
    // sorted by the key emitted above. For each key, there will
    // be a certain number of "1" values. The reduce task sums
    // these values to compute how many times the given key was
    // emitted.
    //
    // The reduce task then emits a key/val pair where the key
    // is the number in question, and the value is the number of
    // times the key was emitted. This is the same format as the
    // original answer key (except that numbers emitted zero times
    // will not appear in the regenerated key.) The answer set
    // is split into a number of pieces. A final MapReduce job
    // will merge them.
    //
    // There's not really a need to go to 10 reduces here
    // instead of 1. But we want to test what happens when
    // you have multiple reduces at once.
    //
    int intermediateReduces = 10;
    Path intermediateOutsRel = new Path("intermediateouts");
    Path intermediateOuts = new Path(TEST_ROOT_DIR, intermediateOutsRel);
    fs.delete(intermediateOuts, true);
    conf = new Configuration();
    Job checkJob = new Job(conf, "check job");
    // FileInputFormat.setInputPaths(checkJob, randomOuts);
    FileInputFormat.setInputPaths(checkJob, randomOutsRel);
    checkJob.setMapperClass(RandomCheckMapper.class);
    // checkJob.setInputFormatClass(TextInputFormat.class);

    FileOutputFormat.setOutputPath(checkJob, intermediateOutsRel);
    checkJob.setOutputKeyClass(IntWritable.class);
    checkJob.setOutputValueClass(IntWritable.class);
    checkJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    checkJob.setReducerClass(RandomCheckReducer.class);
    checkJob.setNumReduceTasks(intermediateReduces);

    pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + TEST_ROOT_DIR);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);

    MapReduceTHelper.submit(checkJob, pamrjc);

    printFiles(intermediateOuts, conf);

    //
    // OK, now we take the output from the last job and
    // merge it down to a single file. The map() and reduce()
    // functions don't really do anything except reemit tuples.
    // But by having a single reduce task here, we end up merging
    // all the files.
    //
    Path finalOutsRel = new Path("finalouts");
    Path finalOuts = new Path(TEST_ROOT_DIR, finalOutsRel);
    fs.delete(finalOuts, true);
    Job mergeJob = new Job(conf, "merge job");
    FileInputFormat.setInputPaths(mergeJob, intermediateOutsRel);
    mergeJob.setInputFormatClass(SequenceFileInputFormat.class);
    mergeJob.setMapperClass(MergeMapper.class);

    FileOutputFormat.setOutputPath(mergeJob, finalOutsRel);
    mergeJob.setOutputKeyClass(IntWritable.class);
    mergeJob.setOutputValueClass(IntWritable.class);
    mergeJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    mergeJob.setReducerClass(MergeReducer.class);
    mergeJob.setNumReduceTasks(1);

    pamrjc = MapReduceTHelper.getConfiguration();
    pamrjc.setInputSpace("file://" + TEST_ROOT_DIR);
    pamrjc.setOutputSpace("file://" + TEST_ROOT_DIR);

    MapReduceTHelper.submit(mergeJob, pamrjc);

    printFiles(finalOuts, conf);

    //
    // Finally, we compare the reconstructed answer key with the
    // original one. Remember, we need to ignore zero-count items
    // in the original key.
    //
    boolean success = true;
    try {
        File dir = new File(finalOuts.toString());
        System.out.println(finalOuts.toString());
        System.out.println(dir);
        String filename = dir.list()[0];
        Path recomputedkey = new Path(finalOuts, filename);
        System.out.println("++++++++++++++++ Path to recomputed key: " + recomputedkey);
        SequenceFile.Reader in = new SequenceFile.Reader(fs, recomputedkey, conf);
        int totalseen = 0;
        try {
            IntWritable key = new IntWritable();
            IntWritable val = new IntWritable();
            for (int i = 0; i < range; i++) {
                if (dist[i] == 0) {
                    continue;
                }
                if (!in.next(key, val)) {
                    System.err.println("Cannot read entry " + i);
                    success = false;
                    break;
                } else {
                    if (!((key.get() == i) && (val.get() == dist[i]))) {
                        System.err.println("Mismatch!  Pos=" + key.get() + ", i=" + i + ", val=" + val.get()
                                + ", dist[i]=" + dist[i]);
                        success = false;
                    }
                    totalseen += val.get();
                }
            }
            if (success) {
                if (in.next(key, val)) {
                    System.err.println("Unnecessary lines in recomputed key!");
                    success = false;
                }
            }
        } finally {
            in.close();
        }
        int originalTotal = 0;
        for (int i = 0; i < dist.length; i++) {
            originalTotal += dist[i];
        }
        System.out.println("Original sum: " + originalTotal);
        System.out.println("Recomputed sum: " + totalseen);

        //
        // Write to "results" whether the test succeeded or not.
        //
        Path resultFile = new Path(TEST_ROOT_DIR, "results");
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(resultFile)));
        try {
            bw.write("Success=" + success + "\n");
            System.out.println("Success=" + success);
        } finally {
            bw.close();
        }
        Assert.assertTrue("Test failed", success);
        fs.delete(TEST_ROOT_DIR, true);
    } catch (Throwable e) {
        Assert.assertTrue("Unexpected exception; test failed", false);
        e.printStackTrace();
    }
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

public static void matrix2Vector(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader = null;
    // Open the SequenceFile; its key/value classes are obtained via reflection below
    reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable val = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class,
                CompressionType.BLOCK);
        final IntWritable key1 = new IntWritable();
        final VectorWritable value = new VectorWritable();
        int lineNum = 0;
        Vector vector = null;
        while (reader.next(key, val)) {
            int index = 0;
            StringTokenizer st = new StringTokenizer(val.toString());
            // Wrap a SequentialAccessSparseVector in a NamedVector keyed by the line number
            vector = new NamedVector(new SequentialAccessSparseVector(Cardinality), lineNum + "");
            while (st.hasMoreTokens()) {
                if (Integer.parseInt(st.nextToken()) == 1) {
                    vector.set(index, 1);
                }
                index++;
            }
            key1.set(lineNum++);
            value.set(vector);
            writer.append(key1, value); // append with the IntWritable key this writer was created with; 'key' is the reader's key
        }
    } finally {
        writer.close();
        reader.close();
    }
}

From source file:io.covert.binary.analysis.BuildSequenceFile.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    File inDir = new File(args[0]);
    Path name = new Path(args[1]);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, name, Text.class, BytesWritable.class,
            CompressionType.RECORD);

    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        FileInputStream fileIn = new FileInputStream(file);
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream((int) file.length());
        int b;
        while (-1 != (b = fileIn.read())) {
            bytesOut.write(b);
        }
        fileIn.close();
        bytesOut.close();
        byte[] bytes = bytesOut.toByteArray();

        val.set(bytes, 0, bytes.length);
        key.set(file.getName());

        writer.append(key, val);
    }
    writer.close();

    return 0;
}