Example usage for the org.apache.hadoop.io.SequenceFile.Sorter constructor

List of usage examples for the org.apache.hadoop.io.SequenceFile.Sorter constructor

Introduction

On this page you can find example usages of the org.apache.hadoop.io.SequenceFile.Sorter constructor.

Prototype

public Sorter(FileSystem fs, RawComparator comparator, Class keyClass, Class valClass, Configuration conf) 

Source Link

Document

Sort and merge using an arbitrary RawComparator.

Usage

From source file:com.kadwa.hadoop.DistExec.java

License:Open Source License

/**
 * Checks whether the file list contains duplicated entries.
 *
 * <p>The sequence file {@code file} is sorted by key into {@code sorted} using a
 * {@code Text.Comparator}. The sorted output is then scanned once, comparing the key of each
 * record with the key of the record immediately before it.
 *
 * @param fs     file system used for both the sort and the subsequent read
 * @param file   the unsorted input sequence file
 * @param sorted the path the sorted output is written to and read back from
 * @param conf   configuration handed to the sorter and the reader
 * @throws DuplicationException if two consecutive sorted records have equal keys
 * @throws IOException          if sorting or reading fails
 */
static private void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
        throws IOException {
    SequenceFile.Reader in = null;
    try {
        final SequenceFile.Sorter keySorter = new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class,
                Text.class, conf);
        keySorter.sort(file, sorted);
        in = new SequenceFile.Reader(fs, sorted, conf);

        // Keys are compared pairwise; the values are only kept for the error message.
        Text previousDst = null;
        Text previousSrc = null;
        Text currentDst = new Text();
        Text currentSrc = new Text();
        while (in.next(currentDst, currentSrc)) {
            final boolean repeatsPrevious = previousDst != null && currentDst.equals(previousDst);
            if (repeatsPrevious) {
                throw new DuplicationException("Invalid input, there are duplicated files in the sources: "
                        + previousSrc + ", " + currentSrc);
            }
            // The record just read becomes the "previous" one; fresh objects receive the next record.
            previousDst = currentDst;
            previousSrc = currentSrc;
            currentDst = new Text();
            currentSrc = new Text();
        }
    } finally {
        checkAndClose(in);
    }
}

From source file:com.pinterest.hdfsbackup.distcp.DistCp.java

License:Apache License

/**
 * Deletes the files/dirs under the destination root which do not exist in the source.
 *
 * <p>The method works in three steps:
 * <ol>
 *   <li>walk the destination tree and write every child's relative path and status to
 *       {@code _distcp_dst_lsr} under {@code jobdir};</li>
 *   <li>sort that listing by path into {@code _distcp_dst_lsr_sorted};</li>
 *   <li>walk the sorted listing and {@code dstsorted} in lockstep, running {@code -rmr} through
 *       an {@code FsShell} for every listed path that has no equal key in {@code dstsorted}.</li>
 * </ol>
 *
 * @param dstfs     file system holding the destination tree
 * @param dstroot   status of the destination root; must be a directory
 * @param dstsorted sorted sequence file of destination paths that should be kept
 * @param jobfs     file system used for the temporary listing files
 * @param jobdir    directory in which the temporary listing files are created
 * @param jobconf   configuration used for the sequence file writer, sorter and readers
 * @param conf      configuration used to build the {@code FsShell}
 * @throws IOException if {@code dstroot} is not a directory, if any file system operation
 *                     fails, or if the shell throws or returns a non-zero value
 */
static private void deleteNonexisting(FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs,
        Path jobdir, JobConf jobconf, Configuration conf) throws IOException {
    if (!dstroot.isDir()) {
        throw new IOException("dst must be a directory when option " + Options.DELETE.cmd
                + " is set, but dst (= " + dstroot.getPath() + ") is not a directory.");
    }

    // Step 1: record every entry found below dstroot.
    final Path listing = new Path(jobdir, "_distcp_dst_lsr");
    final SequenceFile.Writer listingWriter = SequenceFile.createWriter(jobfs, jobconf, listing, Text.class,
            FileStatus.class, SequenceFile.CompressionType.NONE);
    try {
        final Stack<FileStatus> pending = new Stack<FileStatus>();
        pending.push(dstroot);
        while (!pending.isEmpty()) {
            final FileStatus current = pending.pop();
            if (!current.isDir()) {
                continue;
            }
            for (FileStatus child : dstfs.listStatus(current.getPath())) {
                listingWriter.append(new Text(makeRelative(dstroot.getPath(), child.getPath())), child);
                pending.push(child);
            }
        }
    } finally {
        checkAndClose(listingWriter);
    }

    // Step 2: sort the listing by path.
    final Path sortedListing = new Path(jobdir, "_distcp_dst_lsr_sorted");
    final SequenceFile.Sorter listingSorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class,
            FileStatus.class, jobconf);
    listingSorter.sort(listing, sortedListing);

    // Step 3: walk both sorted files in lockstep and delete what is only in the listing.
    SequenceFile.Reader listingReader = null;
    SequenceFile.Reader keepReader = null;
    try {
        listingReader = new SequenceFile.Reader(jobfs, sortedListing, jobconf);
        keepReader = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

        final Text listedPath = new Text();
        final FileStatus listedStatus = new FileStatus();
        final Text keepPath = new Text();
        final Text keepSource = new Text();
        final FsShell shell = new FsShell(conf);
        // Slot 1 always holds the most recently deleted path (null until the first deletion).
        final String[] rmrCommand = { "-rmr", null };

        boolean moreToKeep = keepReader.next(keepPath, keepSource);
        while (listingReader.next(listedPath, listedStatus)) {
            // Advance the keep list until it is no longer behind the listed path.
            int keepVsListed = keepPath.compareTo(listedPath);
            while (moreToKeep && keepVsListed < 0) {
                moreToKeep = keepReader.next(keepPath, keepSource);
                keepVsListed = keepPath.compareTo(listedPath);
            }

            if (keepVsListed == 0) {
                // The listed path is expected in dst: keep it and move on.
                moreToKeep = keepReader.next(keepPath, keepSource);
                continue;
            }

            // The listed path is not expected in dst: delete it.
            final String doomed = new Path(dstroot.getPath(), listedPath.toString()).toString();
            // NOTE(review): presumably isAncestorPath reports that the previously deleted path
            // already contains this one, making a second delete unnecessary - confirm.
            final boolean coveredByLastDelete = rmrCommand[1] != null && isAncestorPath(rmrCommand[1], doomed);
            if (coveredByLastDelete) {
                continue;
            }

            rmrCommand[1] = doomed;
            int exitValue;
            try {
                exitValue = shell.run(rmrCommand);
            } catch (Exception e) {
                throw new IOException("Exception from shell.", e);
            }
            if (exitValue != 0) {
                throw new IOException(
                        "\"" + rmrCommand[0] + " " + rmrCommand[1] + "\" returns non-zero value " + exitValue);
            }
        }
    } finally {
        checkAndClose(listingReader);
        checkAndClose(keepReader);
    }
}

From source file:org.apache.mrql.Bag.java

License:Apache License

/**
 * Sorts the Bag (caching it in memory if necessary).
 *
 * <p>The Bag is materialized first. If it was not spilled during materialization, its
 * in-memory content is sorted with {@link Collections#sort}. Otherwise the spill file is sorted
 * externally with a {@code SequenceFile.Sorter} on the local file system, and the Bag is
 * repointed at the sorted output file.
 *
 * @throws Error if the external sort fails; the underlying exception is attached as the cause
 */
public void sort() {
    materialize();
    if (!spilled()) {
        Collections.sort(content);
        return;
    }
    // The bag was spilled during materialize(): use external sorting.
    try {
        if (writer != null) {
            writer.close();
        }
        FileSystem fs = FileSystem.getLocal(Plan.conf);
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new Plan.MRContainerKeyComparator(),
                MRContainer.class, NullWritable.class, Plan.conf);
        String out_path = new_path(fs);
        System.err.println("*** Using external sorting on a spilled bag " + path + " -> " + out_path);
        sorter.setMemory(64 * 1024 * 1024);
        sorter.sort(new Path(path), new Path(out_path));
        // From here on the bag is backed by the sorted file and has no open writer.
        path = out_path;
        writer = null;
    } catch (Exception ex) {
        // Keep the original failure as the cause so the real reason is not lost.
        throw new Error("Cannot sort a spilled bag", ex);
    }
}

From source file:org.apache.mrql.Plan.java

License:Apache License

/**
 * Merges the sorted files of the data source into one Bag.
 *
 * <p>Every entry under {@code s.path} whose name does not start with {@code "_"} is treated as
 * a sorted input file. If there are none, an empty Bag is returned. If there are more than
 * {@code Config.max_merged_streams} of them, they are first merged into a single new file with
 * {@code SequenceFile.Sorter.merge}. One reader is then opened per remaining file and the
 * returned Bag iterates over their values, always taking the value whose key is smallest.
 *
 * <p>Each reader is closed when it is exhausted. If opening one of the readers fails, the
 * readers that were already opened are closed before the exception propagates.
 *
 * @param s the data source whose {@code path} holds the sorted files
 * @return a Bag backed by an iterator over the merged values
 * @throws Exception if listing, pre-merging or opening the files fails
 */
public final static Bag merge(final DataSource s) throws Exception {
    Path path = new Path(s.path);
    final FileSystem fs = path.getFileSystem(conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    int dl = ds.length;
    if (dl == 0)
        return new Bag();
    Path[] paths = new Path[dl];
    for (int i = 0; i < dl; i++)
        paths[i] = ds[i].getPath();
    if (dl > Config.max_merged_streams) {
        // Too many streams to merge lazily: collapse them into one file up front.
        if (Config.trace)
            System.out.println("Merging " + dl + " files");
        Path out_path = new Path(new_path(conf));
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new MRContainerKeyComparator(),
                MRContainer.class, MRContainer.class, conf);
        sorter.merge(paths, out_path);
        paths = new Path[1];
        paths[0] = out_path;
    }
    final int n = paths.length;

    // Per-stream holders for the current (not yet returned) key and value.
    final MRContainer[] keys = new MRContainer[n];
    final MRContainer[] values = new MRContainer[n];
    for (int i = 0; i < n; i++) {
        keys[i] = new MRContainer();
        values[i] = new MRContainer();
    }

    final SequenceFile.Reader[] readers = new SequenceFile.Reader[n];
    boolean allOpened = false;
    try {
        for (int i = 0; i < n; i++)
            readers[i] = new SequenceFile.Reader(fs, paths[i], conf);
        allOpened = true;
    } finally {
        if (!allOpened) {
            // Opening failed part-way: release the readers that were already opened.
            for (SequenceFile.Reader opened : readers) {
                if (opened != null) {
                    try {
                        opened.close();
                    } catch (IOException ignored) {
                        // Best effort only; the exception from opening is the one to report.
                    }
                }
            }
        }
    }

    return new Bag(new BagIterator() {
        int min = 0;
        boolean first = true;
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();

        /** Loads the next record of stream i, or marks the stream finished and closes it. */
        private void advance(int i) throws IOException {
            if (readers[i].next(key, value)) {
                keys[i].set(key.data());
                values[i].set(value.data());
            } else {
                keys[i] = null;
                readers[i].close();
            }
        }

        public boolean hasNext() {
            if (first) {
                // First call: read the head record of every stream.
                try {
                    first = false;
                    for (int i = 0; i < n; i++)
                        advance(i);
                } catch (IOException e) {
                    throw new Error("Cannot merge values from an intermediate result", e);
                }
            }
            // Select the live stream with the smallest current key; ties go to the lowest index.
            min = -1;
            for (int i = 0; i < n; i++) {
                if (keys[i] == null)
                    continue;
                if (min < 0 || keys[i].compareTo(keys[min]) < 0)
                    min = i;
            }
            return min >= 0;
        }

        public MRData next() {
            try {
                // Capture the value chosen by hasNext() before the stream is advanced.
                MRData res = values[min].data();
                advance(min);
                return res;
            } catch (IOException e) {
                throw new Error("Cannot merge values from an intermediate result", e);
            }
        }
    });
}