List of usage examples for org.apache.hadoop.tools.DistCp#execute()
public Job execute() throws Exception
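execute() builds and submits the underlying MapReduce copy job and returns it as a Job. As a minimal sketch of the call pattern shared by the examples collected below (the namenode hosts and paths are placeholders, and the two-argument DistCpOptions constructor is the Hadoop 2.x form used in these examples):

import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpExecuteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder source and target URIs; replace with real cluster paths.
        DistCpOptions options = new DistCpOptions(
                Collections.singletonList(new Path("hdfs://source-nn/data/input")),
                new Path("hdfs://target-nn/data/output"));
        DistCp distCp = new DistCp(conf, options);
        Job job = distCp.execute();            // submits the DistCp MapReduce job
        if (!job.waitForCompletion(true)) {    // check the returned Job for success
            throw new RuntimeException("DistCp job failed.");
        }
    }
}

Depending on the DistCp version and options, execute() may itself block until the copy finishes; the examples below nonetheless typically inspect the returned Job (via waitForCompletion or getStatus()) before treating the copy as successful.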
From source file:com.inmobi.databus.distcp.DistcpBaseService.java
License:Apache License
protected Boolean executeDistCp(DistCpOptions options) throws Exception {
    // Add additional default arguments to the array below, which gets merged
    // with the arguments as sent in by the derived service
    Configuration conf = destCluster.getHadoopConf();
    DistCp distCp = new DistCp(conf, options);
    try {
        distCp.execute();
    } catch (Exception e) {
        LOG.error("Exception encountered ", e);
        throw e;
    }
    return true;
}
From source file:com.pinterest.terrapin.hadoop.BaseUploader.java
License:Apache License
public void upload(String clusterName, String fileSet, Options options) throws Exception {
    List<Pair<Path, Long>> fileSizePairList = getFileList();
    int numShards = fileSizePairList.size();
    LOG.info("Got " + numShards + " files.");
    if (numShards == 0) {
        LOG.warn("No files found. Exiting.");
        System.exit(1);
    }
    List<Path> parts = Lists.transform(fileSizePairList, new Function<Pair<Path, Long>, Path>() {
        @Override
        public Path apply(Pair<Path, Long> pathLongPair) {
            return pathLongPair.getKey();
        }
    });
    PartitionerType partitionerType = options.getPartitioner();
    validate(parts, partitionerType, numShards);

    long maxSize = -1;
    for (Pair<Path, Long> fileSizePair : fileSizePairList) {
        long size = fileSizePair.getRight();
        if (maxSize < size) {
            maxSize = size;
        }
    }

    // Come up with a new timestamp epoch for the latest data.
    long timestampEpochMillis = System.currentTimeMillis();
    String hdfsDir = Constants.HDFS_DATA_DIR + "/" + fileSet + "/" + timestampEpochMillis;
    ZooKeeperManager zkManager = getZKManager(clusterName);
    FileSetInfo fileSetInfo = new FileSetInfo(fileSet, hdfsDir, numShards, (List) Lists.newArrayList(),
            options);

    int replicationFactor = Constants.DEFAULT_HDFS_REPLICATION;
    if (terrapinNamenode == null || terrapinNamenode.isEmpty()) {
        ClusterInfo info = zkManager.getClusterInfo();
        if (info == null) {
            LOG.error("Could not find the namenode for " + clusterName);
            System.exit(1);
        }
        if (info.hdfsNameNode == null || info.hdfsNameNode.isEmpty()) {
            LOG.error("Could not find the namenode for " + clusterName);
            System.exit(1);
        }
        this.terrapinNamenode = info.hdfsNameNode;
        replicationFactor = info.hdfsReplicationFactor;
    }

    // Connect to the zookeeper and establish a lock on the fileset.
    LOG.info("Locking fileset " + fileSet);
    zkManager.lockFileSet(fileSet, fileSetInfo);

    try {
        LOG.info("Uploading " + numShards + " files through distcp to " + hdfsDir);
        // TODO: Add check for cluster disk space.
        List<Path> sourceFiles = Lists.newArrayListWithCapacity(fileSizePairList.size());
        for (Pair<Path, Long> fileSize : fileSizePairList) {
            sourceFiles.add(fileSize.getLeft());
        }
        if (sourceFiles.size() == 1) {
            hdfsDir = hdfsDir + "/" + TerrapinUtil.formatPartitionName(0);
        }
        DistCpOptions distCpOptions = new DistCpOptions(sourceFiles,
                new Path("hdfs", terrapinNamenode, hdfsDir));
        distCpOptions.setSyncFolder(true);
        distCpOptions.setSkipCRC(true);

        if (maxSize > Constants.DEFAULT_MAX_SHARD_SIZE_BYTES) {
            LOG.warn("Largest shard is " + maxSize + " bytes. This is more than 4G. "
                    + "Increase the # of shards to reduce the size.");
            System.exit(1);
        }

        TerrapinUtil.setupConfiguration(conf, maxSize, replicationFactor);

        DistCp distCp = getDistCp(conf, distCpOptions);
        Job job = distCp.execute();
        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("Distributed copy failed.");
        }

        LOG.info("Successfully copied data.");

        loadFileSetData(zkManager, fileSetInfo, options);

        // Wait for a while so that zookeeper watches have propagated before relinquishing the lock.
        try {
            LOG.info("Releasing file set lock.");
            Thread.sleep(5000);
        } catch (InterruptedException ie) {
            LOG.warn("Interrupted.");
        }
    } finally {
        zkManager.unlockFileSet(fileSet);
    }
}
From source file:com.thinkbiganalytics.nifi.v2.hdfs.DistCopyHDFS.java
License:Apache License
/**
 * onTrigger is called when the flow file proceeds through the processor
 *
 * @param context passed in by the framework and provides access to the data configured in the processor
 * @param session passed in by the framework and provides access to the flow file
 * @throws ProcessException if any framework actions fail
 */
@Override
public void onTrigger(@Nonnull final ProcessContext context, @Nonnull final ProcessSession session)
        throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final FileSystem fs = getFileSystem(context);
    if (fs == null) {
        getLog().error("Couldn't initialize HDFS");
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    String filesJSON = context.getProperty(FILES).evaluateAttributeExpressions(flowFile).getValue();
    String source = context.getProperty(SOURCE).evaluateAttributeExpressions(flowFile).getValue();
    String destination = context.getProperty(DESTINATION).evaluateAttributeExpressions(flowFile).getValue();
    Gson jsonParser = new Gson();
    File[] filesList;
    ArrayList<Path> pathsList = new ArrayList<>();
    try {
        if (!(filesJSON == null) && !filesJSON.isEmpty()) {
            filesList = jsonParser.fromJson(filesJSON, File[].class);
            if (filesList == null) {
                filesList = new File[0];
            }
            if (source != null && !source.isEmpty()) {
                for (File f : filesList) {
                    pathsList.add(new Path(source, f.getName()));
                }
            } else {
                for (File f : filesList) {
                    pathsList.add(new Path(f.getName()));
                }
            }
        } else {
            if (source == null || source.isEmpty()) {
                getLog().error(String.format("At least one of attributes: %s or %s needs to be set",
                        SOURCE.getName(), FILES.getName()));
                session.transfer(flowFile, REL_FAILURE);
                return;
            }
            pathsList.add(new Path(source));
        }
        DistCp distCp = getDistCp(pathsList, new Path(destination));
        Job job = distCp.execute();
        job.waitForCompletion(false);
    } catch (JsonSyntaxException e) {
        getLog().error("Files list attribute does not contain a proper JSON array");
        session.transfer(flowFile, REL_FAILURE);
        return;
    } catch (Exception e) {
        getLog().error("Exception during processor execution: " + e.getMessage());
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    session.transfer(flowFile, REL_SUCCESS);
}
From source file:de.tiqsolutions.hdfs.HadoopFileSystemProvider.java
License:Apache License
private void remoteCopy(Path source, Path target, CopyOption... options) throws IOException {
    Configuration configuration = getConfiguration();
    Path tmp = target.getParent();
    Path dest = null;
    do {
        dest = tmp.resolve(String.format("tmp%s/", System.currentTimeMillis()));
    } while (Files.exists(dest));
    try {
        DistCpOptions distCpOptions = new DistCpOptions(
                Arrays.asList(((HadoopFileSystemPath) source).getPath()),
                ((HadoopFileSystemPath) dest).getPath());
        List<CopyOption> optionList = Arrays.asList(options);
        distCpOptions.setOverwrite(optionList.contains(StandardCopyOption.REPLACE_EXISTING));
        try {
            DistCp distCp = new DistCp(configuration, distCpOptions);
            Job job = distCp.execute();
            job.waitForCompletion(true);
        } catch (Exception e) {
            throw new IOException(e.getLocalizedMessage(), e);
        }
        move(dest.resolve(source.getFileName()), target, options);
    } finally {
        delete(dest, false);
    }
}
From source file:org.apache.falcon.hive.util.EventUtils.java
License:Apache License
public void invokeCopy() throws Exception {
    DistCpOptions options = getDistCpOptions();
    DistCp distCp = new DistCp(conf, options);
    LOG.info("Started DistCp with source Path: {} \ttarget path: {}", sourceStagingUri, targetStagingUri);

    Job distcpJob = distCp.execute();
    LOG.info("Distp Hadoop job: {}", distcpJob.getJobID().toString());
    LOG.info("Completed DistCp");

    if (distcpJob.getStatus().getState() == JobStatus.State.SUCCEEDED) {
        countersMap = HiveDRUtils.fetchReplicationCounters(conf, distcpJob);
    }
}
From source file:org.apache.falcon.replication.FeedReplicator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    CommandLine cmd = getCommand(args);

    Configuration conf = this.getConf();
    // inject wf configs
    Path confPath = new Path("file:///" + System.getProperty("oozie.action.conf.xml"));

    LOG.info("{} found conf ? {}", confPath, confPath.getFileSystem(conf).exists(confPath));
    conf.addResource(confPath);

    String includePathConf = conf.get("falcon.include.path");
    final boolean includePathSet = (includePathConf != null) && !IGNORE.equalsIgnoreCase(includePathConf);

    DistCpOptions options = getDistCpOptions(cmd, includePathSet);

    String availabilityFlagOpt = cmd.getOptionValue("availabilityFlag");
    if (StringUtils.isEmpty(availabilityFlagOpt)) {
        availabilityFlagOpt = "NA";
    }
    String availabilityFlag = EntityUtil.SUCCEEDED_FILE_NAME;
    if (cmd.getOptionValue("falconFeedStorageType").equals(Storage.TYPE.FILESYSTEM.name())) {
        availabilityFlag = "NA".equals(availabilityFlagOpt) ? availabilityFlag : availabilityFlagOpt;
    }

    conf.set("falcon.feed.availability.flag", availabilityFlag);
    DistCp distCp = (includePathSet) ? new CustomReplicator(conf, options) : new DistCp(conf, options);
    LOG.info("Started DistCp with options :" + options);
    Job job = distCp.execute();

    if (cmd.hasOption("counterLogDir") && job.getStatus().getState() == JobStatus.State.SUCCEEDED) {
        LOG.info("Gathering counters for the the Feed Replication job");
        Path counterFile = new Path(cmd.getOptionValue("counterLogDir"), "counter.txt");
        JobCounters fsReplicationCounters = JobCountersHandler.getCountersType(JobType.FSREPLICATION.name());
        if (fsReplicationCounters != null) {
            fsReplicationCounters.obtainJobCounters(conf, job, true);
            fsReplicationCounters.storeJobCounters(conf, counterFile);
        }
    }

    if (includePathSet) {
        executePostProcessing(conf, options); // this only applies for FileSystem Storage.
    }

    LOG.info("Completed DistCp");
    return 0;
}
From source file:org.apache.falcon.snapshots.replication.HdfsSnapshotReplicator.java
License:Apache License
protected void invokeCopy(String sourceStorageUrl, String targetStorageUrl, DistributedFileSystem sourceFs,
        DistributedFileSystem targetFs, String sourceDir, String targetDir, String currentSnapshotName)
        throws FalconException {
    try {
        Configuration jobConf = this.getConf();
        DistCpOptions options = getDistCpOptions(sourceStorageUrl, targetStorageUrl, sourceFs, targetFs,
                sourceDir, targetDir, currentSnapshotName);
        DistCp distCp = new DistCp(jobConf, options);
        LOG.info("Started Snapshot based DistCp from {} to {} ", getStagingUri(sourceStorageUrl, sourceDir),
                getStagingUri(targetStorageUrl, targetDir));
        Job distcpJob = distCp.execute();
        LOG.info("Distp Hadoop job: {}", distcpJob.getJobID().toString());
        LOG.info("Completed Snapshot based DistCp");
    } catch (FalconException fe) {
        throw fe;
    } catch (Exception e) {
        throw new FalconException("Unable to replicate HDFS directory using snapshots.", e);
    }
}
From source file:org.apache.ivory.replication.FeedReplicator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    DistCpOptions options = getDistCpOptions(args);
    DistCp distCp = new CustomReplicator(this.getConf(), options);
    LOG.info("Started DistCp");
    distCp.execute();
    LOG.info("Completed DistCp");
    return 0;
}