List of usage examples for the org.apache.hadoop.io.SequenceFile.Sorter constructor
public Sorter(FileSystem fs, RawComparator comparator, Class keyClass, Class valClass, Configuration conf)
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/**
 * Checks whether the file list contains duplicated keys.
 *
 * The records of {@code file} are sorted by key into {@code sorted}, which is
 * then scanned once; because equal keys end up adjacent after sorting, it is
 * enough to compare every record's key with the key of the record before it.
 *
 * @param fs     file system holding both {@code file} and {@code sorted}
 * @param file   sequence file of (Text, Text) records to check
 * @param sorted path that receives the sorted copy of {@code file}
 * @param conf   configuration handed to the sorter and the reader
 * @throws IOException if sorting or reading fails
 * @throws DuplicationException if two consecutive sorted records have equal keys
 */
static private void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
        throws IOException {
    SequenceFile.Reader in = null;
    try {
        new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf)
                .sort(file, sorted);
        in = new SequenceFile.Reader(fs, sorted, conf);

        Text previousKey = null;
        Text previousValue = null;
        Text key = new Text();
        Text value = new Text();
        while (in.next(key, value)) {
            if (previousKey != null && key.equals(previousKey)) {
                throw new DuplicationException(
                        "Invalid input, there are duplicated files in the sources: " + previousValue + ", "
                                + value);
            }
            // Keep the record just read as "previous" and read the next one
            // into fresh objects so the two never alias each other.
            previousKey = key;
            previousValue = value;
            key = new Text();
            value = new Text();
        }
    } finally {
        checkAndClose(in);
    }
}
From source file:com.pinterest.hdfsbackup.distcp.DistCp.java
License:Apache License
/** Delete the dst files/dirs which do not exist in src */ static private void deleteNonexisting(FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs, Path jobdir, JobConf jobconf, Configuration conf) throws IOException { if (!dstroot.isDir()) { throw new IOException("dst must be a directory when option " + Options.DELETE.cmd + " is set, but dst (= " + dstroot.getPath() + ") is not a directory."); }//w w w .j a v a2 s. com //write dst lsr results final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr"); final SequenceFile.Writer writer = SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE); try { //do lsr to get all file statuses in dstroot final Stack<FileStatus> lsrstack = new Stack<FileStatus>(); for (lsrstack.push(dstroot); !lsrstack.isEmpty();) { final FileStatus status = lsrstack.pop(); if (status.isDir()) { for (FileStatus child : dstfs.listStatus(status.getPath())) { String relative = makeRelative(dstroot.getPath(), child.getPath()); writer.append(new Text(relative), child); lsrstack.push(child); } } } } finally { checkAndClose(writer); } //sort lsr results final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted"); SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class, FileStatus.class, jobconf); sorter.sort(dstlsr, sortedlsr); //compare lsr list and dst list SequenceFile.Reader lsrin = null; SequenceFile.Reader dstin = null; try { lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf); dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf); //compare sorted lsr list and sorted dst list final Text lsrpath = new Text(); final FileStatus lsrstatus = new FileStatus(); final Text dstpath = new Text(); final Text dstfrom = new Text(); final FsShell shell = new FsShell(conf); final String[] shellargs = { "-rmr", null }; boolean hasnext = dstin.next(dstpath, dstfrom); for (; lsrin.next(lsrpath, lsrstatus);) { int dst_cmp_lsr = 
dstpath.compareTo(lsrpath); for (; hasnext && dst_cmp_lsr < 0;) { hasnext = dstin.next(dstpath, dstfrom); dst_cmp_lsr = dstpath.compareTo(lsrpath); } if (dst_cmp_lsr == 0) { //lsrpath exists in dst, skip it hasnext = dstin.next(dstpath, dstfrom); } else { //lsrpath does not exist, delete it String s = new Path(dstroot.getPath(), lsrpath.toString()).toString(); if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) { shellargs[1] = s; int r = 0; try { r = shell.run(shellargs); } catch (Exception e) { throw new IOException("Exception from shell.", e); } if (r != 0) { throw new IOException( "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r); } } } } } finally { checkAndClose(lsrin); checkAndClose(dstin); } }
From source file:org.apache.mrql.Bag.java
License:Apache License
/** * sort the Bag (cache it in memory if necessary). * If the Bag was spilled during caching, use external sorting *///from w w w. j a v a 2 s. c o m public void sort() { materialize(); if (spilled()) // if it was spilled during materialize() try { // use external sorting if (writer != null) writer.close(); FileSystem fs = FileSystem.getLocal(Plan.conf); SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new Plan.MRContainerKeyComparator(), MRContainer.class, NullWritable.class, Plan.conf); String out_path = new_path(fs); System.err.println("*** Using external sorting on a spilled bag " + path + " -> " + out_path); sorter.setMemory(64 * 1024 * 1024); sorter.sort(new Path(path), new Path(out_path)); path = out_path; writer = null; } catch (Exception ex) { throw new Error("Cannot sort a spilled bag"); } else Collections.sort(content); }
From source file:org.apache.mrql.Plan.java
License:Apache License
/**
 * Merge the sorted files of the data source.
 *
 * Every file directly under {@code s.path} whose name does not start with
 * {@code "_"} is treated as one sorted input. If there are more inputs than
 * {@code Config.max_merged_streams}, they are first merged into a single file
 * with {@link SequenceFile.Sorter}. The returned Bag then reads the remaining
 * input(s) through an iterator that always yields the value whose key is the
 * smallest among the current heads of all readers.
 *
 * @param s the data source whose {@code path} holds the sorted files
 * @return a Bag over the merged values; an empty Bag if there are no input files
 * @throws Exception if listing, merging or opening the files fails
 */
public final static Bag merge(final DataSource s) throws Exception {
    Path path = new Path(s.path);
    final FileSystem fs = path.getFileSystem(conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    int dl = ds.length;
    if (dl == 0) {
        return new Bag();
    }
    Path[] paths = new Path[dl];
    for (int i = 0; i < dl; i++) {
        paths[i] = ds[i].getPath();
    }
    if (dl > Config.max_merged_streams) {
        // too many streams to merge lazily: pre-merge them into one file
        if (Config.trace) {
            System.out.println("Merging " + dl + " files");
        }
        Path out_path = new Path(new_path(conf));
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new MRContainerKeyComparator(),
                MRContainer.class, MRContainer.class, conf);
        sorter.merge(paths, out_path);
        paths = new Path[1];
        paths[0] = out_path;
    }
    final int n = paths.length;
    final SequenceFile.Reader[] readers = new SequenceFile.Reader[n];
    for (int i = 0; i < n; i++) {
        readers[i] = new SequenceFile.Reader(fs, paths[i], conf);
    }
    // keys[i]/values[i] hold the current head of reader i; keys[i] == null
    // marks a reader that is exhausted and already closed
    final MRContainer[] keys = new MRContainer[n];
    final MRContainer[] values = new MRContainer[n];
    for (int i = 0; i < n; i++) {
        keys[i] = new MRContainer();
        values[i] = new MRContainer();
    }
    return new Bag(new BagIterator() {
        // index of the reader whose head has the smallest key, or -1 if none is left
        int min = 0;
        boolean first = true;
        // scratch containers reused for every read
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();

        public boolean hasNext() {
            if (first) {
                // first call: load the head record of every reader
                try {
                    first = false;
                    for (int i = 0; i < n; i++) {
                        if (readers[i].next(key, value)) {
                            keys[i].set(key.data());
                            values[i].set(value.data());
                        } else {
                            keys[i] = null;
                            readers[i].close();
                        }
                    }
                } catch (IOException e) {
                    // chain the cause so the underlying I/O failure stays visible
                    throw new Error("Cannot merge values from an intermediate result", e);
                }
            }
            min = -1;
            for (int i = 0; i < n; i++) {
                if (keys[i] == null) {
                    continue;
                }
                if (min < 0 || keys[i].compareTo(keys[min]) < 0) {
                    min = i;
                }
            }
            return min >= 0;
        }

        public MRData next() {
            try {
                MRData res = values[min].data();
                // refill the head of the reader that was just consumed
                if (readers[min].next(key, value)) {
                    keys[min].set(key.data());
                    values[min].set(value.data());
                } else {
                    keys[min] = null;
                    readers[min].close();
                }
                return res;
            } catch (IOException e) {
                // chain the cause so the underlying I/O failure stays visible
                throw new Error("Cannot merge values from an intermediate result", e);
            }
        }
    });
}