Example usage for org.apache.hadoop.tools DistCp execute

List of usage examples for org.apache.hadoop.tools DistCp execute

Introduction

On this page you can find example usage for org.apache.hadoop.tools DistCp execute.

Prototype

public Job execute() throws Exception 

Document

Implements the core execution: creates the file listing for the copy and launches the Hadoop job that performs it, returning the Job handle.
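
A minimal sketch of the call pattern shared by the examples below, assuming the Hadoop 2.x DistCpOptions(List&lt;Path&gt;, Path) constructor; the class name and paths are placeholders, not part of any of the quoted sources:

import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpExecuteExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder source and target paths for this sketch.
        DistCpOptions options = new DistCpOptions(
                Collections.singletonList(new Path("hdfs://source-nn/data/in")),
                new Path("hdfs://target-nn/data/out"));
        DistCp distCp = new DistCp(conf, options);
        // execute() builds the copy listing and launches the Hadoop job,
        // returning the Job handle for monitoring.
        Job job = distCp.execute();
        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("DistCp job failed: " + job.getJobID());
        }
    }
}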

Usage

From source file:com.inmobi.databus.distcp.DistcpBaseService.java

License:Apache License

protected Boolean executeDistCp(DistCpOptions options) throws Exception {
    //Add Additional Default arguments to the array below which gets merged
    //with the arguments as sent in by the Derived Service
    Configuration conf = destCluster.getHadoopConf();
    DistCp distCp = new DistCp(conf, options);
    try {
        distCp.execute();
    } catch (Exception e) {
        LOG.error("Exception encountered ", e);
        throw e;
    }
    return true;
}

From source file:com.pinterest.terrapin.hadoop.BaseUploader.java

License:Apache License

public void upload(String clusterName, String fileSet, Options options) throws Exception {
    List<Pair<Path, Long>> fileSizePairList = getFileList();

    int numShards = fileSizePairList.size();
    LOG.info("Got " + numShards + " files.");
    if (numShards == 0) {
        LOG.warn("No files found. Exiting.");
        System.exit(1);
    }

    List<Path> parts = Lists.transform(fileSizePairList, new Function<Pair<Path, Long>, Path>() {
        @Override
        public Path apply(Pair<Path, Long> pathLongPair) {
            return pathLongPair.getKey();
        }
    });
    PartitionerType partitionerType = options.getPartitioner();

    validate(parts, partitionerType, numShards);
    long maxSize = -1;
    for (Pair<Path, Long> fileSizePair : fileSizePairList) {
        long size = fileSizePair.getRight();
        if (maxSize < size) {
            maxSize = size;
        }
    }
    // Come up with a new timestamp epoch for the latest data.
    long timestampEpochMillis = System.currentTimeMillis();
    String hdfsDir = Constants.HDFS_DATA_DIR + "/" + fileSet + "/" + timestampEpochMillis;
    ZooKeeperManager zkManager = getZKManager(clusterName);
    FileSetInfo fileSetInfo = new FileSetInfo(fileSet, hdfsDir, numShards, (List) Lists.newArrayList(),
            options);

    int replicationFactor = Constants.DEFAULT_HDFS_REPLICATION;
    if (terrapinNamenode == null || terrapinNamenode.isEmpty()) {
        ClusterInfo info = zkManager.getClusterInfo();
        if (info == null) {
            LOG.error("Could not find the namenode for " + clusterName);
            System.exit(1);
        }
        if (info.hdfsNameNode == null || info.hdfsNameNode.isEmpty()) {
            LOG.error("Could not find the namenode for " + clusterName);
            System.exit(1);
        }
        this.terrapinNamenode = info.hdfsNameNode;
        replicationFactor = info.hdfsReplicationFactor;
    }
    // Connect to the zookeeper and establish a lock on the fileset.
    LOG.info("Locking fileset " + fileSet);
    zkManager.lockFileSet(fileSet, fileSetInfo);

    try {
        LOG.info("Uploading " + numShards + " files through distcp to " + hdfsDir);

        // TODO: Add check for cluster disk space.
        List<Path> sourceFiles = Lists.newArrayListWithCapacity(fileSizePairList.size());
        for (Pair<Path, Long> fileSize : fileSizePairList) {
            sourceFiles.add(fileSize.getLeft());
        }
        if (sourceFiles.size() == 1) {
            hdfsDir = hdfsDir + "/" + TerrapinUtil.formatPartitionName(0);
        }
        DistCpOptions distCpOptions = new DistCpOptions(sourceFiles,
                new Path("hdfs", terrapinNamenode, hdfsDir));
        distCpOptions.setSyncFolder(true);
        distCpOptions.setSkipCRC(true);

        if (maxSize > Constants.DEFAULT_MAX_SHARD_SIZE_BYTES) {
            LOG.warn("Largest shard is " + maxSize + " bytes. This is more than 4G. "
                    + "Increase the # of shards to reduce the size.");
            System.exit(1);
        }
        TerrapinUtil.setupConfiguration(conf, maxSize, replicationFactor);

        DistCp distCp = getDistCp(conf, distCpOptions);
        Job job = distCp.execute();
        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("Distributed copy failed.");
        }

        LOG.info("Successfully copied data.");

        loadFileSetData(zkManager, fileSetInfo, options);

        // Wait for a while so that zookeeper watches have propagated before relinquishing the lock.
        try {
            LOG.info("Releasing file set lock.");
            Thread.sleep(5000);
        } catch (InterruptedException ie) {
            LOG.warn("Interrupted.");
        }
    } finally {
        zkManager.unlockFileSet(fileSet);
    }
}

From source file:com.thinkbiganalytics.nifi.v2.hdfs.DistCopyHDFS.java

License:Apache License

/**
 * onTrigger is called when the flow file proceeds through the processor
 *
 * @param context passed in by the framework and provides access to the data configured in the processor
 * @param session passed in by the framework and provides access to the flow file
 * @throws ProcessException if any framework actions fail
 */
@Override
public void onTrigger(@Nonnull final ProcessContext context, @Nonnull final ProcessSession session)
        throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final FileSystem fs = getFileSystem(context);
    if (fs == null) {
        getLog().error("Couldn't initialize HDFS");
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    String filesJSON = context.getProperty(FILES).evaluateAttributeExpressions(flowFile).getValue();
    String source = context.getProperty(SOURCE).evaluateAttributeExpressions(flowFile).getValue();
    String destination = context.getProperty(DESTINATION).evaluateAttributeExpressions(flowFile).getValue();
    Gson jsonParser = new Gson();
    File[] filesList;
    ArrayList<Path> pathsList = new ArrayList<>();
    try {
        if (!(filesJSON == null) && !filesJSON.isEmpty()) {
            filesList = jsonParser.fromJson(filesJSON, File[].class);
            if (filesList == null) {
                filesList = new File[0];
            }
            if (source != null && !source.isEmpty()) {
                for (File f : filesList) {
                    pathsList.add(new Path(source, f.getName()));
                }
            } else {
                for (File f : filesList) {
                    pathsList.add(new Path(f.getName()));
                }
            }
        } else {
            if (source == null || source.isEmpty()) {
                getLog().error(String.format("At least one of attributes: %s or %s needs to be set",
                        SOURCE.getName(), FILES.getName()));

                session.transfer(flowFile, REL_FAILURE);
                return;
            }
            pathsList.add(new Path(source));
        }
        DistCp distCp = getDistCp(pathsList, new Path(destination));
        Job job = distCp.execute();
        job.waitForCompletion(false);
    } catch (JsonSyntaxException e) {
        getLog().error("Files list attribute does not contain a proper JSON array");
        session.transfer(flowFile, REL_FAILURE);
        return;
    } catch (Exception e) {
        getLog().error("Exception during processor execution: " + e.getMessage());
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    session.transfer(flowFile, REL_SUCCESS);
}

From source file:de.tiqsolutions.hdfs.HadoopFileSystemProvider.java

License:Apache License

private void remoteCopy(Path source, Path target, CopyOption... options) throws IOException {
    Configuration configuration = getConfiguration();
    Path tmp = target.getParent();
    Path dest = null;
    do {
        dest = tmp.resolve(String.format("tmp%s/", System.currentTimeMillis()));
    } while (Files.exists(dest));
    try {
        DistCpOptions distCpOptions = new DistCpOptions(
                Arrays.asList(((HadoopFileSystemPath) source).getPath()),
                ((HadoopFileSystemPath) dest).getPath());
        List<CopyOption> optionList = Arrays.asList(options);

        distCpOptions.setOverwrite(optionList.contains(StandardCopyOption.REPLACE_EXISTING));
        try {
            DistCp distCp = new DistCp(configuration, distCpOptions);
            Job job = distCp.execute();
            job.waitForCompletion(true);
        } catch (Exception e) {
            throw new IOException(e.getLocalizedMessage(), e);
        }
        move(dest.resolve(source.getFileName()), target, options);
    } finally {
        delete(dest, false);
    }

}

From source file:org.apache.falcon.hive.util.EventUtils.java

License:Apache License

public void invokeCopy() throws Exception {
    DistCpOptions options = getDistCpOptions();
    DistCp distCp = new DistCp(conf, options);
    LOG.info("Started DistCp with source Path: {} \ttarget path: {}", sourceStagingUri, targetStagingUri);

    Job distcpJob = distCp.execute();
    LOG.info("Distp Hadoop job: {}", distcpJob.getJobID().toString());
    LOG.info("Completed DistCp");
    if (distcpJob.getStatus().getState() == JobStatus.State.SUCCEEDED) {
        countersMap = HiveDRUtils.fetchReplicationCounters(conf, distcpJob);
    }
}

From source file:org.apache.falcon.replication.FeedReplicator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    CommandLine cmd = getCommand(args);

    Configuration conf = this.getConf();
    // inject wf configs
    Path confPath = new Path("file:///" + System.getProperty("oozie.action.conf.xml"));

    LOG.info("{} found conf ? {}", confPath, confPath.getFileSystem(conf).exists(confPath));
    conf.addResource(confPath);

    String includePathConf = conf.get("falcon.include.path");
    final boolean includePathSet = (includePathConf != null) && !IGNORE.equalsIgnoreCase(includePathConf);

    DistCpOptions options = getDistCpOptions(cmd, includePathSet);

    String availabilityFlagOpt = cmd.getOptionValue("availabilityFlag");
    if (StringUtils.isEmpty(availabilityFlagOpt)) {
        availabilityFlagOpt = "NA";
    }
    String availabilityFlag = EntityUtil.SUCCEEDED_FILE_NAME;
    if (cmd.getOptionValue("falconFeedStorageType").equals(Storage.TYPE.FILESYSTEM.name())) {
        availabilityFlag = "NA".equals(availabilityFlagOpt) ? availabilityFlag : availabilityFlagOpt;
    }

    conf.set("falcon.feed.availability.flag", availabilityFlag);
    DistCp distCp = (includePathSet) ? new CustomReplicator(conf, options) : new DistCp(conf, options);
    LOG.info("Started DistCp with options :" + options);
    Job job = distCp.execute();

    if (cmd.hasOption("counterLogDir") && job.getStatus().getState() == JobStatus.State.SUCCEEDED) {
        LOG.info("Gathering counters for the the Feed Replication job");
        Path counterFile = new Path(cmd.getOptionValue("counterLogDir"), "counter.txt");
        JobCounters fsReplicationCounters = JobCountersHandler.getCountersType(JobType.FSREPLICATION.name());
        if (fsReplicationCounters != null) {
            fsReplicationCounters.obtainJobCounters(conf, job, true);
            fsReplicationCounters.storeJobCounters(conf, counterFile);
        }
    }

    if (includePathSet) {
        executePostProcessing(conf, options); // this only applies for FileSystem Storage.
    }

    LOG.info("Completed DistCp");
    return 0;
}

From source file:org.apache.falcon.snapshots.replication.HdfsSnapshotReplicator.java

License:Apache License

protected void invokeCopy(String sourceStorageUrl, String targetStorageUrl, DistributedFileSystem sourceFs,
        DistributedFileSystem targetFs, String sourceDir, String targetDir, String currentSnapshotName)
        throws FalconException {
    try {
        Configuration jobConf = this.getConf();
        DistCpOptions options = getDistCpOptions(sourceStorageUrl, targetStorageUrl, sourceFs, targetFs,
                sourceDir, targetDir, currentSnapshotName);
        DistCp distCp = new DistCp(jobConf, options);
        LOG.info("Started Snapshot based DistCp from {} to {} ", getStagingUri(sourceStorageUrl, sourceDir),
                getStagingUri(targetStorageUrl, targetDir));
        Job distcpJob = distCp.execute();
        LOG.info("Distp Hadoop job: {}", distcpJob.getJobID().toString());
        LOG.info("Completed Snapshot based DistCp");

    } catch (FalconException fe) {
        throw fe;
    } catch (Exception e) {
        throw new FalconException("Unable to replicate HDFS directory using snapshots.", e);
    }
}

From source file:org.apache.ivory.replication.FeedReplicator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    DistCpOptions options = getDistCpOptions(args);
    DistCp distCp = new CustomReplicator(this.getConf(), options);
    LOG.info("Started DistCp");
    distCp.execute();
    LOG.info("Completed DistCp");
    return 0;

}