Example usage for org.apache.hadoop.io SequenceFile createWriter

Introduction

This page lists example usages of org.apache.hadoop.io.SequenceFile.createWriter.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
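
The overload documented above is deprecated: it writes to a caller-supplied FSDataOutputStream, whereas most of the examples below use the older path-based overloads, and current Hadoop releases favor the Writer.Option-based createWriter variants. The following is a minimal, self-contained sketch of the stream-based overload; the output path, key/value types, and codec are illustrative assumptions, not taken from the examples below. Because the Writer here does not own the supplied stream, the stream is closed separately.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CreateWriterSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.seq"); // illustrative path

        FSDataOutputStream out = fs.create(path);
        SequenceFile.Writer writer = null;
        try {
            // Deprecated stream-based overload: the caller owns the stream.
            writer = SequenceFile.createWriter(conf, out, IntWritable.class, Text.class,
                    CompressionType.BLOCK, new DefaultCodec());
            for (int i = 0; i < 10; i++) {
                writer.append(new IntWritable(i), new Text("record-" + i));
            }
        } finally {
            if (writer != null) {
                writer.close(); // flushes writer state; does not close the supplied stream
            }
            out.close();
        }
    }
}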

Usage

From source file:org.commoncrawl.service.listcrawler.HDFSFlusherThread.java

License:Open Source License

@Override
public void run() {

    boolean shutdown = false;

    while (!shutdown) {

        try {

            final CacheFlushRequest request = _manager.getHDFSFlushRequestQueue().take();

            switch (request._requestType) {

            case ExitThreadRequest: {
                // shutdown condition ... 
                CacheManager.LOG.info("Cache Flusher Thread Received Shutdown. Exiting!");
                shutdown = true;
            }
                break;

            case FlushRequest: {

                LOG.info("Received Flush Request");

                ArrayList<IndexDataFileTriple> tempFiles = new ArrayList<IndexDataFileTriple>();
                ArrayList<FingerprintAndOffsetTuple> tuplesOut = new ArrayList<FingerprintAndOffsetTuple>();

                // flag to track request status at end .. 
                boolean requestFailed = false;

                long logStart = LocalLogFileHeader.SIZE;
                long logEnd = logStart + request._bytesToFlush;

                // create a hdfs temp file for data (and index)
                long generateTime = System.currentTimeMillis();
                Path tempDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
                        + "/flusher-temp-" + generateTime);

                // mkdir ... 
                try {
                    _manager.getRemoteFileSystem().mkdirs(tempDir);
                } catch (IOException e1) {
                    LOG.error(CCStringUtils.stringifyException(e1));
                    requestFailed = true;
                }

                int iterationNumber = 0;

                while (logStart != logEnd && !requestFailed) {

                    Path tempDataFile = new Path(tempDir,
                            CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + iterationNumber);
                    Path tempIndexFile = new Path(tempDir,
                            CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + iterationNumber);

                    LOG.info("FlushRequest Pass#:" + iterationNumber + " DataPath:" + tempDataFile
                            + " IndexPath:" + tempIndexFile);

                    SequenceFile.Writer writer = null;
                    FSDataOutputStream indexOutputStream = null;
                    RandomAccessFile localLogFile = null;

                    try {

                        LOG.info("Pass#:" + iterationNumber + " Opening SequenceFile for Output");
                        // open a temporary hdfs streams ...
                        writer = SequenceFile.createWriter(_manager.getRemoteFileSystem(),
                                CrawlEnvironment.getHadoopConfig(), tempDataFile, Text.class, CacheItem.class,
                                CompressionType.NONE);

                        // opening index output stream ... 
                        LOG.info("Pass#:" + iterationNumber + " Opening Index Output Stream");
                        indexOutputStream = _manager.getRemoteFileSystem().create(tempIndexFile);

                        LOG.info("Pass#:" + iterationNumber + " Opening Local Log");
                        localLogFile = new RandomAccessFile(_manager.getActiveLogFilePath(), "rw");

                        // transfer log entries and generate index
                        logStart = generateSequenceFileAndIndex(_manager.getCacheFlushThreshold(), localLogFile,
                                logStart, logEnd, _manager.getLocalLogSyncBytes(), writer, indexOutputStream,
                                tuplesOut);
                    } catch (IOException e) {
                        CacheManager.LOG.error(CCStringUtils.stringifyException(e));
                        requestFailed = true;
                    } finally {
                        if (writer != null) {
                            try {
                                writer.close();
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                        }
                        if (indexOutputStream != null) {
                            try {
                                indexOutputStream.close();
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                        }
                        if (localLogFile != null) {
                            try {
                                localLogFile.close();
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                        }
                    }

                    if (requestFailed) {
                        try {
                            LOG.info("Pass#:" + iterationNumber + " Failed. Deleting temp files");
                            _manager.getRemoteFileSystem().delete(tempDataFile, false);
                            _manager.getRemoteFileSystem().delete(tempIndexFile, false);
                        } catch (IOException e) {
                            LOG.error("Delete Failed During Failure! Potenital Orphan Files! : "
                                    + CCStringUtils.stringifyException(e));
                        }
                        break;
                    } else {
                        LOG.info("Pass#:" + iterationNumber + " Finished. Adding files to tuple list");
                        // add temp file tuple
                        IndexDataFileTriple indexDataPair = new IndexDataFileTriple();

                        indexDataPair._dataFilePath = tempDataFile;
                        indexDataPair._indexFilePath = tempIndexFile;

                        tempFiles.add(indexDataPair);
                    }
                    iterationNumber++;
                }

                LOG.info("All Passes Complete. Finalizing Commit");

                // ok if request failed ... 
                if (!requestFailed) {

                    int itemIndex = 0;
                    for (IndexDataFileTriple indexDataPair : tempFiles) {
                        // generate final paths ... 
                        Path finalOutputDir = _manager.getRemoteDataDirectory();

                        Path finalDataFilePath = new Path(finalOutputDir,
                                CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + (generateTime + itemIndex));
                        Path finalIndexFilePath = new Path(finalOutputDir,
                                CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + (generateTime + itemIndex));

                        try {
                            LOG.info("Pass#:" + itemIndex + " Renaming Temp Files");
                            LOG.info("Pass#:" + itemIndex + " Final Data File Name is:" + finalDataFilePath);
                            LOG.info("Pass#:" + itemIndex + " Final Index File Name is:" + finalIndexFilePath);

                            // rename files ... 
                            _manager.getRemoteFileSystem().rename(indexDataPair._dataFilePath,
                                    finalDataFilePath);
                            indexDataPair._dataFilePath = finalDataFilePath;
                            _manager.getRemoteFileSystem().rename(indexDataPair._indexFilePath,
                                    finalIndexFilePath);
                            indexDataPair._indexFilePath = finalIndexFilePath;
                        } catch (IOException e) {
                            LOG.info("Pass#:" + itemIndex + " Rename Failed");
                            LOG.error(CCStringUtils.stringifyException(e));
                            requestFailed = true;
                            break;
                        }

                        try {
                            // copy to local ...
                            indexDataPair._localIndexFilePath = new File(_manager.getLocalDataDirectory(),
                                    finalIndexFilePath.getName());

                            LOG.info("Pass#:" + itemIndex + " Copying Remote Index File at:"
                                    + finalIndexFilePath + " to Local Directory:"
                                    + indexDataPair._localIndexFilePath.getAbsolutePath());
                            _manager.getRemoteFileSystem().copyToLocalFile(finalIndexFilePath,
                                    new Path(indexDataPair._localIndexFilePath.getAbsolutePath()));
                            LOG.info("Pass#:" + itemIndex + " Done Copying Remote Index File to Local");
                        } catch (IOException e) {
                            LOG.info("Pass#:" + itemIndex + " Local File Copy Failed with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            requestFailed = true;
                            indexDataPair._localIndexFilePath = null;
                            break;
                        }
                        // increment item index
                        itemIndex++;
                    }
                    // ok callback to manager if request succeeded 
                    if (!requestFailed) {
                        try {
                            LOG.info("Flush Complete. Calling hdfsFlushComplete");
                            _manager.hdfsCacheFlushRequestComplete(request, tuplesOut, tempFiles);
                            LOG.info("Flush Complete. hdfsFlushComplete succeeded");
                        } catch (IOException e) {

                            LOG.error("hdfsFlushComplete returned Exception:"
                                    + CCStringUtils.stringifyException(e));
                            requestFailed = true;
                        }
                    }

                }

                if (requestFailed) {
                    LOG.info("Cache Manager Log Flush Failed. Deleteing files");
                    try {
                        // delete temp file directory recursively 
                        _manager.getRemoteFileSystem().delete(tempDir, true);
                    } catch (IOException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                    }
                    // iterate temp file list 
                    for (IndexDataFileTriple triple : tempFiles) {
                        try {
                            LOG.info("Deleteing:" + triple._dataFilePath);
                            _manager.getRemoteFileSystem().delete(triple._dataFilePath, false);
                            LOG.info("Deleteing:" + triple._indexFilePath);
                            _manager.getRemoteFileSystem().delete(triple._indexFilePath, false);
                            if (triple._localIndexFilePath != null) {
                                LOG.info("Deleteing LOCAL:" + triple._localIndexFilePath);
                                triple._localIndexFilePath.delete();
                            }
                        } catch (IOException e) {
                            LOG.error(CCStringUtils.stringifyException(e));
                        }
                    }
                    // callback to manager with the bad news ... 
                    _manager.hdfsCacheFlushRequestFailed(request);
                }
            }
                break;
            }
        } catch (InterruptedException e) {
            LOG.error("Unexpected Exception in HDFSFlusher Thread:" + CCStringUtils.stringifyException(e));
        }
    }
}

From source file:org.commoncrawl.util.MapReduceJobStatsWriter.java

License:Open Source License

/** Constructor
 *
 * @param remoteFileSystem  remote file system to flush stats to
 * @param config            Hadoop configuration
 * @param keyClass          key type
 * @param valueClass        value type
 * @param familyKey         log family key
 * @param groupingKey       grouping key
 * @param uniqueKey         unique id for this writer
 */
public MapReduceJobStatsWriter(FileSystem remoteFileSystem, Configuration config, Class<KeyType> keyClass,
        Class<ValueType> valueClass, String familyKey, String groupingKey, long uniqueKey) throws IOException {

    _logFamily = familyKey;
    _groupingKey = groupingKey;
    _uniqueKey = uniqueKey;
    _remoteFileSystem = remoteFileSystem;
    _config = config;
    // temp file 
    _tempFileName = File.createTempFile("statsWriter", "seq");
    // create output stream that sequence file writer will output to
    _outputStream = FileSystem.getLocal(_config).create(new Path(_tempFileName.getAbsolutePath()));

    LzoCodec codec = new LzoCodec();
    // create sequencefile writer 
    _writer = SequenceFile.createWriter(config, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
            codec);
    // start event loop
    _eventLoop.start();
}

From source file:org.honu.datacollection.writer.localfs.LockFreeWriter.java

License:Apache License

protected void rotate() {
    Tracer t = Tracer.startNewTracer("honu.server." + group + ".rotateDataSink");
    isRunning = true;
    calendar.setTimeInMillis(System.currentTimeMillis());
    log.info(group + "- start Date [" + calendar.getTime() + "]");
    log.info(group + "- Rotate from " + Thread.currentThread().getName());

    String newName = day.format(calendar.getTime());
    newName += localHostAddr + new java.rmi.server.UID().toString();
    newName = newName.replace("-", "");
    newName = newName.replace(":", "");
    // newName = newName.replace(".", "");
    newName = localOutputDir + "/" + newName.trim();

    try {
        FSDataOutputStream previousOutputStr = currentOutputStr;
        Path previousPath = currentPath;
        String previousFileName = currentFileName;

        if (previousOutputStr != null) {
            seqFileWriter.close();
            previousOutputStr.close();
            if (chunksWrittenThisRotate) {
                fs.rename(previousPath, new Path(previousFileName + ".done"));
                fileQueue.add(previousFileName + ".done");
            } else {
                log.info(group + "- no chunks written to " + previousPath + ", deleting");
                fs.delete(previousPath, false);
            }
        }

        Path newOutputPath = new Path(newName + ".chukwa");
        FSDataOutputStream newOutputStr = fs.create(newOutputPath);

        currentOutputStr = newOutputStr;
        currentPath = newOutputPath;
        currentFileName = newName;
        chunksWrittenThisRotate = false;

        if (codec != null) {
            seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class,
                    ChunkImpl.class, SequenceFile.CompressionType.BLOCK, codec);
        } else {
            seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class,
                    ChunkImpl.class, SequenceFile.CompressionType.NONE, codec);
        }

    } catch (Throwable e) {
        if (t != null) {
            t.stopAndLogTracer();
        }

        log.fatal(group + "- Throwable Exception in rotate. Exiting!", e);
        // Shutting down the collector
        // Watchdog will re-start it automatically
        DaemonWatcher.bailout(-1);
    }

    // Check for disk space
    File directory4Space = new File(localOutputDir);
    long totalSpace = directory4Space.getTotalSpace();
    long freeSpace = directory4Space.getFreeSpace();
    long minFreeAvailable = (totalSpace * minPercentFreeDisk) / 100;

    if (log.isDebugEnabled()) {
        log.debug(group + "- Directory: " + localOutputDir + ", totalSpace: " + totalSpace + ", freeSpace: "
                + freeSpace + ", minFreeAvailable: " + minFreeAvailable + ", percentFreeDisk: "
                + minPercentFreeDisk);
    }

    if (freeSpace < minFreeAvailable) {
        log.fatal(group + "- No space left on device, Bail out!");
        DaemonWatcher.bailout(-1);
    }
    nextRotate = System.currentTimeMillis() + rotateInterval;

    if (t != null) {
        t.stopAndLogTracer();
    }
}

From source file:org.jd.copier.mapred.DistCp.java

License:Apache License

/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    //set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean skipCRCCheck = args.flags.contains(Options.SKIPCRC);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.SKIPCRC.propertyname, skipCRCCheck);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }

    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(jClient.getFs(), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (null == parent) {
                // If dst is '/' on S3, it might not exist yet, but dst.getParent()
                // will return null. In this case, use '/' as its own parent to prevent
                // NPE errors below.
                parent = args.dst;
            }
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        //skip file if the src and the dst files are the same.
                        skipfile = update
                                && sameFile(srcfs, child, dstfs, new Path(args.dst, dst), skipCRCCheck);
                        //skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    // Explicitly create the tmpDir to ensure that it can be cleaned
    // up by fullyDelete() later.
    tmpDir.getFileSystem(conf).mkdirs(tmpDir);

    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToCopyCount=" + fileCount);
    LOG.info("bytesToCopyCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
    return fileCount > 0;
}

From source file:org.jd.copier.mapred.DistCp.java

License:Apache License

/** Delete the dst files/dirs which do not exist in src */
static private void deleteNonexisting(FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs,
        Path jobdir, JobConf jobconf, Configuration conf) throws IOException {
    if (!dstroot.isDir()) {
        throw new IOException("dst must be a directory when option " + Options.DELETE.cmd
                + " is set, but dst (= " + dstroot.getPath() + ") is not a directory.");
    }

    //write dst lsr results
    final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
    final SequenceFile.Writer writer = SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class,
            dstroot.getClass(), SequenceFile.CompressionType.NONE);
    try {
        //do lsr to get all file statuses in dstroot
        final Stack<FileStatus> lsrstack = new Stack<FileStatus>();
        for (lsrstack.push(dstroot); !lsrstack.isEmpty();) {
            final FileStatus status = lsrstack.pop();
            if (status.isDir()) {
                for (FileStatus child : dstfs.listStatus(status.getPath())) {
                    String relative = makeRelative(dstroot.getPath(), child.getPath());
                    writer.append(new Text(relative), child);
                    lsrstack.push(child);
                }
            }
        }
    } finally {
        checkAndClose(writer);
    }

    //sort lsr results
    final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
    SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class,
            FileStatus.class, jobconf);
    sorter.sort(dstlsr, sortedlsr);

    //compare lsr list and dst list  
    SequenceFile.Reader lsrin = null;
    SequenceFile.Reader dstin = null;
    try {
        lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
        dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

        //compare sorted lsr list and sorted dst list
        final Text lsrpath = new Text();
        final FileStatus lsrstatus = new FileStatus();
        final Text dstpath = new Text();
        final Text dstfrom = new Text();
        final FsShell shell = new FsShell(conf);
        final String[] shellargs = { "-rmr", null };

        boolean hasnext = dstin.next(dstpath, dstfrom);
        for (; lsrin.next(lsrpath, lsrstatus);) {
            int dst_cmp_lsr = dstpath.compareTo(lsrpath);
            for (; hasnext && dst_cmp_lsr < 0;) {
                hasnext = dstin.next(dstpath, dstfrom);
                dst_cmp_lsr = dstpath.compareTo(lsrpath);
            }

            if (dst_cmp_lsr == 0) {
                //lsrpath exists in dst, skip it
                hasnext = dstin.next(dstpath, dstfrom);
            } else {
                //lsrpath does not exist, delete it
                String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
                if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
                    shellargs[1] = s;
                    int r = 0;
                    try {
                        r = shell.run(shellargs);
                    } catch (Exception e) {
                        throw new IOException("Exception from shell.", e);
                    }
                    if (r != 0) {
                        throw new IOException(
                                "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r);
                    }
                }
            }
        }
    } finally {
        checkAndClose(lsrin);
        checkAndClose(dstin);
    }
}

From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java

License:Apache License

static Writer openWriter(final Configuration conf, final Path path, final String subdir, final Class<?> key,
        final Class<?> value) throws IOException {
    final FileSystem fs = HadoopFileUtils.getFileSystem(path);

    final String name = HadoopUtils.createRandomString(10);
    final Path stringsPath = new Path(path, subdir + "/" + name);
    return SequenceFile.createWriter(fs, conf, stringsPath, key, value, SequenceFile.CompressionType.RECORD);
}

From source file:org.oclc.firefly.hadoop.backup.Backup.java

License:Apache License

/**
 * Create mapper input files containing their paths to copy
 * @param mapperInput The list of files that the copy mappers should copy
 * @param numMapTasks The number of map tasks
 * @param fs The file system to write to
 * @param id The mapper id
 * @return The list of input files for a mapper
 * @throws IOException If we fail to create input files
 */
private List<Path> createMapperInputSequenceFiles(List<Pair<String, HRegionInfo>> mapperInput, int numMapTasks,
        FileSystem fs, int id) throws IOException {
    int idx = 0;
    List<Path> paths = new ArrayList<Path>();
    List<SequenceFile.Writer> writers = new ArrayList<SequenceFile.Writer>();
    String inputDir = getMapInputDirectory(id);

    // delete this directory if already exists
    fs.delete(new Path(inputDir), true);

    // each mapper gets an input file
    for (int i = 0; i < numMapTasks; i++) {
        // open the input file for writing
        Path mapInputFile = new Path(inputDir + "/mapper-input-" + i + ".txt");
        fs.delete(mapInputFile, false);

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, fs.getConf(), mapInputFile, Text.class,
                HRegionInfo.class, SequenceFile.CompressionType.NONE);

        LOG.debug("Mapper input: " + mapInputFile);

        paths.add(mapInputFile);
        writers.add(writer);
    }

    // Assign copy paths to mappers
    for (Pair<String, HRegionInfo> pair : mapperInput) {
        Text key = new Text(pair.getFirst());
        HRegionInfo value = new HRegionInfo(pair.getSecond());

        LOG.debug("Appending " + key + ", " + value.getEncodedName());
        writers.get(idx).append(key, value);

        idx++;
        if (idx >= writers.size()) {
            idx = 0;
        }
    }

    // close writers
    for (SequenceFile.Writer writer : writers) {
        try {
            writer.sync();
            writer.close();
        } catch (Exception e) {
            // nothing to do here
        }
    }

    return paths;
}

From source file:org.qcri.pca.FileFormat.java

public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        int lineNumber = 0;
        String thisLine;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) // if it is a file
        {
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            Vector vector = null;
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) { // while loop begins here
                if (thisLine.isEmpty())
                    continue;
                String[] splitted = thisLine.split("\\s+");
                vector = new SequentialAccessSparseVector(splitted.length);
                for (int i = 0; i < splitted.length; i++) {
                    vector.set(i, Double.parseDouble(splitted[i]));
                }
                key.set(lineNumber);
                value.set(vector);
                //System.out.println(vector);
                writer.append(key, value); // write this row
                lineNumber++;
            }
            writer.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:org.qcri.pca.FileFormat.java

public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = null;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        Vector vector = null;

        String thisLine;
        int prevRowID = -1;
        boolean first = true;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) // if it is a file
        {
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) { // while loop begins here            
                String[] splitted = thisLine.split(",");
                int rowID = Integer.parseInt(splitted[0]);
                int colID = Integer.parseInt(splitted[1]);
                double element = Double.parseDouble(splitted[2]);
                if (first) {
                    first = false;
                    vector = new SequentialAccessSparseVector(cardinality);
                } else if (rowID != prevRowID) {
                    key.set(prevRowID);
                    value.set(vector);
                    //System.out.println(vector);
                    writer.append(key, value);//write last row
                    vector = new SequentialAccessSparseVector(cardinality);
                }
                prevRowID = rowID;
                vector.set(colID - base, element);
            }
            /*//here we append the last vector in each file (assuming that we will start a new row in the next file
            key.set(prevRowID);
            value.set(vector);
            //System.out.println("last vector");
            //System.out.println(vector);
            writer.append(key,value);//write last row
            writer.close();
            */
        }
        if (writer != null) //append last vector in last file
        {
            key.set(prevRowID);
            value.set(vector);
            //System.out.println("last vector");
            //System.out.println(vector);
            writer.append(key, value);//write last row
            writer.close();
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:org.qcri.sparkpca.FileFormat.java

public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
    try {
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        SequenceFile.Writer writer = null;

        final IntWritable key = new IntWritable();
        final VectorWritable value = new VectorWritable();

        Vector vector = null;

        String thisLine;

        int lineNumber = 0;
        int prevRowID = -1;
        boolean first = true;
        File[] filePathList = null;
        File inputFile = new File(inputPath);
        if (inputFile.isFile()) // if it is a file
        {
            filePathList = new File[1];
            filePathList[0] = inputFile;
        } else {
            filePathList = inputFile.listFiles();
        }
        if (filePathList == null) {
            log.error("The path " + inputPath + " does not exist");
            return;
        }
        for (File file : filePathList) {
            BufferedReader br = new BufferedReader(new FileReader(file));
            String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
            writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
                    VectorWritable.class, CompressionType.BLOCK);
            while ((thisLine = br.readLine()) != null) { // while loop begins here            
                String[] splitted = thisLine.split(",");
                int rowID = Integer.parseInt(splitted[0]);
                int colID = Integer.parseInt(splitted[1]);
                double element = Double.parseDouble(splitted[2]);
                if (first) {
                    first = false;
                    vector = new SequentialAccessSparseVector(cardinality);
                } else if (rowID != prevRowID) {
                    key.set(prevRowID);
                    value.set(vector);
                    //System.out.println(vector);
                    writer.append(key, value);//write last row
                    vector = new SequentialAccessSparseVector(cardinality);
                }
                prevRowID = rowID;
                vector.set(colID - base, element);
            }
        }
        if (writer != null) //append last vector in last file
        {
            key.set(prevRowID);
            value.set(vector);
            writer.append(key, value);//write last row
            writer.close();
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}