List of usage examples for org.apache.hadoop.fs Path getName
public String getName()
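Path.getName() returns the final component of a path, i.e. everything after the last "/". A minimal sketch of the behavior (the path below is a made-up example, not taken from any of the projects listed):

    Path p = new Path("hdfs://namenode:8020/user/data/input.bson");
    String name = p.getName();   // "input.bson"
    Path parent = p.getParent(); // "hdfs://namenode:8020/user/data"

The examples below show how real projects use this to build sibling file names, compare paths against known file names, and derive local file names from HDFS paths.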
From source file:com.zjy.mongo.splitter.BSONSplitter.java
License:Apache License
/**
 * Get the path to the ".splits" file for a BSON file.
 * @param filePath the path to the BSON file.
 * @param conf the Hadoop configuration.
 * @return the path to the ".splits" file.
 */
public static Path getSplitsFilePath(final Path filePath, final Configuration conf) {
    String splitsPath = MongoConfigUtil.getBSONSplitsPath(conf);
    String splitsFileName = "." + filePath.getName() + ".splits";
    if (null == splitsPath) {
        return new Path(filePath.getParent(), splitsFileName);
    }
    return new Path(splitsPath, splitsFileName);
}
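As a quick illustration of the rule above, a sketch with made-up values (no splits path configured, so the parent directory of the BSON file is used):

    // Hypothetical input; MongoConfigUtil.getBSONSplitsPath(conf) is assumed to return null.
    Path bsonFile = new Path("/data/dump.bson");
    Path splits = getSplitsFilePath(bsonFile, conf);
    // getName() yields "dump.bson", so splits is "/data/.dump.bson.splits"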
From source file:contrail.correct.CorrectUtil.java
License:Apache License
/**
 * Gets the distributed cache path of a given file.
 * @param binary name of the file you are looking for
 * @param job the JobConf object
 * @return the local cache path of the file, or an empty string if it is not found
 */
public String getDcachePath(String binary, JobConf job) {
    Path[] dcacheFiles;
    String path = "";
    try {
        dcacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (null != dcacheFiles && dcacheFiles.length > 0) {
            for (Path cachePath : dcacheFiles) {
                if (cachePath.getName().equals(binary)) {
                    path = cachePath.toString();
                }
            }
        }
    } catch (IOException e) {
        sLogger.error(e.getStackTrace());
    }
    return path;
}
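The comparison above works because getName() drops the directory portion of the cached path, so it can be matched against a bare file name. A small sketch with a hypothetical cache entry:

    Path cached = new Path("/tmp/mapred/local/archive/bowtie"); // made-up local cache location
    cached.getName().equals("bowtie");   // true
    cached.toString().equals("bowtie");  // false, the full local directory is included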
From source file:contrail.util.FileHelper.java
License:Open Source License
/**
 * Moves the contents of oldPath into newPath. This is used
 * to save the final graph.
 * @param oldPath
 * @param newPath
 */
static public void moveDirectoryContents(Configuration conf, String oldPath, String newPath) {
    // We can't invoke rename directly on oldPath because that would make
    // oldPath a subdirectory of newPath.
    FileSystem fs = null;
    try {
        fs = FileSystem.get(conf);
    } catch (IOException e) {
        throw new RuntimeException("Can't get filesystem: " + e.getMessage());
    }
    try {
        Path oldPathObject = new Path(oldPath);
        for (FileStatus status : fs.listStatus(oldPathObject)) {
            Path oldFile = status.getPath();
            Path newFile = new Path(newPath, oldFile.getName());
            fs.rename(oldFile, newFile);
        }
    } catch (IOException e) {
        throw new RuntimeException("Problem moving the files: " + e.getMessage());
    }
}
From source file:cs480a2.yqiu.recSystem.mapreduce.input.SingleBookReader.java
/**
 * @param inputSplit
 * @param context the information about the task
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit split = (FileSplit) inputSplit;
    Configuration configuration = context.getConfiguration();
    Path path = split.getPath();
    filename = path.getName();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FSDataInputStream inputStream = fileSystem.open(path);
    lineReader = new LineReader(inputStream, configuration);

    // initial start point and end point
    start = split.getStart();
    end = start + split.getLength();

    inputStream.seek(start);
    if (start != 0) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    start += lineReader.readLine(currentLine);

    prepareToScanBook();
}
From source file:datafu.hourglass.jobs.AbstractPartitionPreservingIncrementalJob.java
License:Apache License
/**
 * Execute the job.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private void execute() throws IOException, InterruptedException, ClassNotFoundException {
    int iterations = 0;

    while (true) {
        PartitionPreservingExecutionPlanner planner = new PartitionPreservingExecutionPlanner(getFileSystem(), getProperties());
        planner.setInputPaths(getInputPaths());
        planner.setOutputPath(getOutputPath());
        planner.setStartDate(getStartDate());
        planner.setEndDate(getEndDate());
        planner.setDaysAgo(getDaysAgo());
        planner.setNumDays(getNumDays());
        planner.setMaxToProcess(getMaxToProcess());
        planner.setFailOnMissing(isFailOnMissing());
        planner.createPlan();

        if (planner.getInputsToProcess().size() == 0) {
            _log.info("Found all necessary incremental data");
            break;
        }

        if (iterations >= getMaxIterations()) {
            throw new RuntimeException(String.format(
                    "Already completed %d iterations but the max is %d and there are still %d inputs to process",
                    iterations, getMaxIterations(), planner.getInputsToProcess().size()));
        }

        Path jobTempPath = createRandomTempPath();
        _garbage.add(jobTempPath);
        ensurePath(getOutputPath());

        Path incrementalStagingPath = ensurePath(new Path(jobTempPath, ".incremental-staging"));
        Path incrementalStagingTmpPath = ensurePath(new Path(jobTempPath, ".incremental-staging-tmp"));

        Report report = new Report();

        // create input paths for job
        List<String> inputPaths = new ArrayList<String>();
        for (DatePath input : planner.getInputsToProcess()) {
            inputPaths.add(input.getPath().toString());
            report.inputFiles.add(input);
        }

        _log.info("Staging path: " + incrementalStagingPath);
        final StagedOutputJob job = StagedOutputJob.createStagedJob(getConf(), getName() + "-" + "incremental",
                inputPaths, incrementalStagingTmpPath.toString(), incrementalStagingPath.toString(), _log);

        job.setCountersParentPath(getCountersParentPath());

        final Configuration conf = job.getConfiguration();

        config(conf);

        PartitionPreservingSchemas fpSchemas = new PartitionPreservingSchemas(getSchemas(),
                planner.getInputSchemasByPath(), getOutputSchemaName(), getOutputSchemaNamespace());

        job.setInputFormatClass(AvroMultipleInputsKeyInputFormat.class);
        job.setOutputFormatClass(AvroKeyOutputFormat.class);

        _log.info("Setting input path to schema mappings");
        for (String path : fpSchemas.getMapInputSchemas().keySet()) {
            Schema schema = fpSchemas.getMapInputSchemas().get(path);
            _log.info("*** " + path);
            _log.info("*** => " + schema.toString());
            AvroMultipleInputsUtil.setInputKeySchemaForPath(job, schema, path);
        }

        AvroJob.setMapOutputKeySchema(job, fpSchemas.getMapOutputKeySchema());
        AvroJob.setMapOutputValueSchema(job, fpSchemas.getMapOutputValueSchema());
        AvroJob.setOutputKeySchema(job, fpSchemas.getReduceOutputSchema());

        StringBuilder inputTimesJoined = new StringBuilder();
        for (Date input : planner.getDatesToProcess()) {
            String namedOutput = PathUtils.datedPathFormat.format(input);
            _log.info(String.format("Adding named output %s", namedOutput));
            AvroMultipleOutputs.addNamedOutput(job, namedOutput, AvroKeyOutputFormat.class,
                    fpSchemas.getReduceOutputSchema());
            inputTimesJoined.append(Long.toString(input.getTime()));
            inputTimesJoined.append(",");
        }

        int numReducers;
        if (getNumReducers() != null) {
            numReducers = getNumReducers();
            _log.info(String.format("Using %d reducers (fixed)", numReducers));
        } else {
            numReducers = planner.getNumReducers();
            _log.info(String.format("Using %d reducers (computed)", numReducers));
        }

        int avgReducersPerInput = (int) Math.ceil(numReducers / (double) planner.getDatesToProcess().size());
        _log.info(String.format("Reducers per input path: %d", avgReducersPerInput));

        // counters for multiple outputs
        // conf.set("mo.counters", "true");

        conf.set(TimePartitioner.REDUCERS_PER_INPUT, Integer.toString(avgReducersPerInput));
        conf.set(TimePartitioner.INPUT_TIMES, inputTimesJoined.substring(0, inputTimesJoined.length() - 1));

        job.setNumReduceTasks(numReducers);

        Path mapperPath = new Path(incrementalStagingPath, ".mapper_impl");
        Path reducerPath = new Path(incrementalStagingPath, ".reducer_impl");
        Path combinerPath = new Path(incrementalStagingPath, ".combiner_impl");

        conf.set(Parameters.REDUCER_IMPL_PATH, reducerPath.toString());
        conf.set(Parameters.MAPPER_IMPL_PATH, mapperPath.toString());

        _mapper = new PartitioningMapper();
        _mapper.setSchemas(fpSchemas);
        _mapper.setMapper(getMapper());

        _reducer = new PartitioningReducer();
        _reducer.setSchemas(fpSchemas);
        _reducer.setAccumulator(getReducerAccumulator());

        DistributedCacheHelper.writeObject(conf, getMapProcessor(), mapperPath);
        DistributedCacheHelper.writeObject(conf, getReduceProcessor(), reducerPath);

        job.setMapperClass(DelegatingMapper.class);
        job.setReducerClass(DelegatingReducer.class);

        if (isUseCombiner()) {
            _combiner = new PartitioningCombiner();
            _combiner.setAccumulator(getCombinerAccumulator());
            conf.set(Parameters.COMBINER_IMPL_PATH, combinerPath.toString());
            job.setCombinerClass(DelegatingCombiner.class);
            DistributedCacheHelper.writeObject(conf, getCombineProcessor(), combinerPath);
        }

        job.setPartitionerClass(TimePartitioner.class);

        if (!job.waitForCompletion(true)) {
            _log.error("Job failed! Quitting...");
            throw new RuntimeException("Job failed");
        }

        report.jobName = job.getJobName();
        report.jobId = job.getJobID().toString();

        moveStagedFiles(report, incrementalStagingPath);

        if (getCountersParentPath() == null) {
            // save the counters in the target path, for lack of a better place to put it
            Path counters = job.getCountersPath();
            if (getFileSystem().exists(counters)) {
                Path target = new Path(getOutputPath(), counters.getName());
                if (getFileSystem().exists(target)) {
                    _log.info(String.format("Removing old counters at %s", target));
                    getFileSystem().delete(target, true);
                }
                _log.info(String.format("Moving %s to %s", counters.getName(), getOutputPath()));
                getFileSystem().rename(counters, target);

                report.countersPath = target;
            } else {
                _log.error("Could not find counters at " + counters);
            }
        }

        applyRetention();

        _reports.add(report);

        if (!planner.getNeedsAnotherPass()) {
            break;
        }

        cleanup();

        iterations++;
    }
}
From source file:datafu.hourglass.jobs.AbstractPartitionPreservingIncrementalJob.java
License:Apache License
/**
 * Moves files from the staging path to the final output path.
 *
 * @param report report to update with output paths
 * @param sourcePath source of data to move
 * @throws IOException
 */
private void moveStagedFiles(Report report, Path sourcePath) throws IOException {
    _log.info("Following files produced in staging path:");
    for (FileStatus stat : getFileSystem().globStatus(new Path(sourcePath, "*.avro"))) {
        _log.info(String.format("* %s (%d bytes)", stat.getPath(), stat.getLen()));
    }

    FileStatus[] incrementalParts = getFileSystem().globStatus(new Path(sourcePath, "*"), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String[] pathParts = path.getName().split("-");
            try {
                Long.parseLong(pathParts[0]);
                return true;
            } catch (NumberFormatException e) {
                return false;
            }
        }
    });

    // collect the new incremental data from the temp folder and move to subfolders
    Map<String, Path> incrementalTargetPaths = new HashMap<String, Path>();
    for (FileStatus stat : incrementalParts) {
        String[] pathParts = stat.getPath().getName().split("-");
        try {
            String timestamp = pathParts[0];
            if (!incrementalTargetPaths.containsKey(timestamp)) {
                Path parent = new Path(sourcePath, timestamp);

                if (!getFileSystem().exists(parent)) {
                    getFileSystem().mkdirs(parent);
                } else {
                    throw new RuntimeException("already exists: " + parent.toString());
                }

                incrementalTargetPaths.put(timestamp, parent);
            }

            Path parent = incrementalTargetPaths.get(timestamp);
            _log.info(String.format("Moving %s to %s", stat.getPath().getName(), parent.toString()));
            getFileSystem().rename(stat.getPath(), new Path(parent, stat.getPath().getName()));
        } catch (NumberFormatException e) {
            throw new RuntimeException(e);
        }
    }

    for (Path src : incrementalTargetPaths.values()) {
        Date srcDate;
        try {
            srcDate = PathUtils.datedPathFormat.parse(src.getName());
        } catch (ParseException e) {
            throw new RuntimeException(e);
        }

        Path target = new Path(getOutputPath(), PathUtils.nestedDatedPathFormat.format(srcDate));
        _log.info(String.format("Moving %s to %s", src.getName(), target));

        getFileSystem().mkdirs(target.getParent());

        if (!getFileSystem().rename(src, target)) {
            throw new RuntimeException("Failed to rename " + src + " to " + target);
        }

        report.outputFiles.add(new DatePath(srcDate, target));
    }
}
From source file:de.gesundkrank.wikipedia.hadoop.inputformat.WikiInputFormat.java
License:Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    return !filename.getName().endsWith(".bz2");
}
From source file:de.huberlin.wbi.hiway.common.Client.java
License:Apache License
/**
 * Main run function for the client.
 *
 * @return true if application completed successfully.
 */
private boolean run() throws IOException, YarnException {
    /* log */ System.out.println("Running Client");
    yarnClient.start();

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    /* log */ System.out.println(
            "Got Cluster metric info from ASM" + ", numNodeManagers=" + clusterMetrics.getNumNodeManagers());

    List<NodeReport> clusterNodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
    /* log */ System.out.println("Got Cluster node info from ASM");
    /* log */ for (NodeReport node : clusterNodeReports)
        System.out.println("Got node report from ASM for" + ", nodeId=" + node.getNodeId() + ", nodeAddress"
                + node.getHttpAddress() + ", nodeRackName" + node.getRackName() + ", nodeNumContainers"
                + node.getNumContainers());

    QueueInfo queueInfo = yarnClient.getQueueInfo(this.amQueue);
    /* log */ System.out.println("Queue info" + ", queueName=" + queueInfo.getQueueName()
            + ", queueCurrentCapacity=" + queueInfo.getCurrentCapacity() + ", queueMaxCapacity="
            + queueInfo.getMaximumCapacity() + ", queueApplicationCount=" + queueInfo.getApplications().size()
            + ", queueChildQueueCount=" + queueInfo.getChildQueues().size());

    List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
    /* log */ for (QueueUserACLInfo aclInfo : listAclInfo)
        for (QueueACL userAcl : aclInfo.getUserAcls())
            System.out.println("User ACL Info for Queue" + ", queueName=" + aclInfo.getQueueName()
                    + ", userAcl=" + userAcl.name());

    // Get a new application id
    YarnClientApplication app = yarnClient.createApplication();
    GetNewApplicationResponse appResponse = app.getNewApplicationResponse();

    // Get min/max resource capabilities from RM and change memory ask if needed
    int maxVC = appResponse.getMaximumResourceCapability().getVirtualCores();
    /* log */ System.out.println("Max vCores capabililty of resources in this cluster " + maxVC);
    int maxMem = appResponse.getMaximumResourceCapability().getMemory();
    /* log */ System.out.println("Max mem capabililty of resources in this cluster " + maxMem);

    // A resource ask cannot exceed the max.
    if (amVCores > maxVC) {
        /* log */ System.out.println("AM vCores specified above max threshold of cluster. Using max value."
                + ", specified=" + amVCores + ", max=" + maxVC);
        amVCores = maxVC;
    }
    if (amMemory > maxMem) {
        /* log */ System.out.println("AM memory specified above max threshold of cluster. Using max value."
                + ", specified=" + amMemory + ", max=" + maxMem);
        amMemory = maxMem;
    }

    // set the application name
    ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
    appContext.setApplicationType(conf.get(HiWayConfiguration.HIWAY_AM_APPLICATION_TYPE,
            HiWayConfiguration.HIWAY_AM_APPLICATION_TYPE_DEFAULT));
    appContext.setApplicationName("run " + workflowParam + " (type: " + workflowType.toString() + ")");
    ApplicationId appId = appContext.getApplicationId();
    String hdfsBaseDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE_DEFAULT);
    String hdfsSandboxDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE_DEFAULT);
    Path hdfsBaseDirectory = new Path(new Path(hdfs.getUri()), hdfsBaseDirectoryName);
    Data.setHdfsBaseDirectory(hdfsBaseDirectory);
    Path hdfsSandboxDirectory = new Path(hdfsBaseDirectory, hdfsSandboxDirectoryName);
    Path hdfsApplicationDirectory = new Path(hdfsSandboxDirectory, appId.toString());
    Data.setHdfsApplicationDirectory(hdfsApplicationDirectory);
    Data.setHdfs(hdfs);

    Path wfSource, wfDest, wfTemp = null;
    try {
        wfSource = new Path(new URI(workflowParam).getPath());
    } catch (URISyntaxException e) {
        wfSource = new Path(workflowParam);
    }
    wfDest = new Path(hdfsApplicationDirectory + "/" + wfSource.getName());

    // (1) if workflow file in hdfs, then transfer to temp file in local fs
    if (hdfs.exists(wfSource)) {
        wfTemp = new Path("./." + wfSource.getName());
        System.out.println("Workflow found in HDFS at location " + wfSource);
        hdfs.copyToLocalFile(false, wfSource, wfTemp);
    }

    // (2) if galaxy workflow, then copy and replace input ports
    if (workflowType.equals(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_OPTS.galaxy)) {
        wfTemp = preProcessGalaxyWorkflow(wfSource, wfTemp);
    }

    if (wfTemp != null) {
        hdfs.copyFromLocalFile(wfTemp, wfDest);
        new File(wfTemp.toString()).delete();
    } else {
        hdfs.copyFromLocalFile(wfSource, wfDest);
    }

    if (summaryPath != null)
        summary = new Data(summaryPath);
    if (customMemPath != null)
        (new Data(customMemPath)).stageOut();

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);

    /* set the env variables to be setup in the env where the application master will be run */
    System.out.println("Set the environment for the application master");
    Map<String, String> env = new HashMap<>();

    StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$()).append(File.pathSeparatorChar)
            .append("./*");
    for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
            YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)) {
        classPathEnv.append(':');
        classPathEnv.append(File.pathSeparatorChar);
        classPathEnv.append(c.trim());
    }
    if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
        classPathEnv.append(':');
        classPathEnv.append(System.getProperty("java.class.path"));
    }
    env.put("CLASSPATH", classPathEnv.toString());
    amContainer.setEnvironment(env);

    // Set the necessary command to execute the application master
    Vector<CharSequence> vargs = new Vector<>(30);

    // Set java executable command
    System.out.println("Setting up app master command");
    vargs.add(Environment.JAVA_HOME.$() + "/bin/java");
    if (HiWayConfiguration.debug)
        vargs.add(
                "-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=9010 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false");
    // Set Xmx based on am memory size
    vargs.add("-Xmx" + amMemory + "m");
    vargs.add("-Xss" + "16m");
    // Set class name
    switch (workflowType) {
    case dax:
        vargs.add(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_DAX_AM_CLASS);
        break;
    case log:
        vargs.add(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_LOG_AM_CLASS);
        break;
    case galaxy:
        vargs.add(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_GALAXY_AM_CLASS);
        break;
    case cuneiformE:
        vargs.add(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_CUNEIFORME_AM_CLASS);
        break;
    default:
        vargs.add(HiWayConfiguration.HIWAY_WORKFLOW_LANGUAGE_CUNEIFORMJ_AM_CLASS);
    }
    vargs.add("--scheduler " + schedulerName.toString());
    if (memory != null)
        vargs.add("--memory " + memory);
    if (summary != null)
        vargs.add("--summary " + summary.getName());
    if (customMemPath != null)
        vargs.add("--custom " + customMemPath);
    vargs.add("--appid " + appId.toString());
    if (HiWayConfiguration.debug)
        vargs.add("--debug");
    if (HiWayConfiguration.verbose)
        vargs.add("--verbose");
    vargs.add(workflowParam);
    vargs.add("> >(tee AppMaster.stdout " + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout)");
    vargs.add("2> >(tee AppMaster.stderr " + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr >&2)");

    // Get final command
    StringBuilder command = new StringBuilder();
    for (CharSequence str : vargs) {
        command.append(str).append(" ");
    }
    System.out.println("Completed setting up app master command " + command.toString());
    List<String> commands = new ArrayList<>();
    commands.add(command.toString());
    amContainer.setCommands(commands);

    // Set up resource type requirements
    Resource capability = Records.newRecord(Resource.class);
    capability.setVirtualCores(amVCores);
    capability.setMemory(amMemory);
    appContext.setResource(capability);

    // Setup security tokens
    if (UserGroupInformation.isSecurityEnabled()) {
        Credentials credentials = new Credentials();
        String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
        if (tokenRenewer == null || tokenRenewer.length() == 0) {
            throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
        }

        // For now, only getting tokens for the default file-system.
        final Token<?> tokens[] = hdfs.addDelegationTokens(tokenRenewer, credentials);
        if (tokens != null) {
            for (Token<?> token : tokens) {
                System.out.println("Got dt for " + hdfs.getUri() + "; " + token);
            }
        }
        try (DataOutputBuffer dob = new DataOutputBuffer()) {
            credentials.writeTokenStorageToStream(dob);
            ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
            amContainer.setTokens(fsTokens);
        }
    }

    appContext.setAMContainerSpec(amContainer);

    // Set the priority for the application master
    Priority pri = Records.newRecord(Priority.class);
    pri.setPriority(amPriority);
    appContext.setPriority(pri);

    // Set the queue to which this application is to be submitted in the RM
    appContext.setQueue(amQueue);

    // Submit the application to the applications manager
    /* log */ System.out.println("Submitting application to ASM");
    yarnClient.submitApplication(appContext);

    // Monitor the application
    boolean success = monitorApplication(appId);

    if (success && summary != null) {
        summary.stageIn();
    }
    return success;
}
From source file:de.huberlin.wbi.hiway.common.Client.java
License:Apache License
/**
 * Copy and replace input ports.
 */
private Path preProcessGalaxyWorkflow(Path wfSource, Path wfTemp) throws IOException {
    List<String> lines = new ArrayList<>();
    try (BufferedReader reader = new BufferedReader(
            new FileReader(wfTemp == null ? wfSource.toString() : wfTemp.toString()))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.contains("\"name\": \"Input dataset\"")) {
                String inputLine = lines.get(lines.size() - 3);
                String portName = inputLine.substring(inputLine.indexOf("\"name\": \"") + 9,
                        inputLine.lastIndexOf("\""));
                System.out.println("Enter file location in HDFS for Galaxy workflow input port \"" + portName
                        + "\". Press return or wait 30 seconds to use default value \"" + portName + "\".");
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
                long startTime = System.currentTimeMillis();
                // wait 30s
                while ((System.currentTimeMillis() - startTime) < 30 * 1000 && !in.ready()) {
                }
                if (in.ready()) {
                    String newPortName = in.readLine();
                    if (newPortName.length() > 0) {
                        inputLine = inputLine.replace(portName, newPortName);
                        lines.set(lines.size() - 3, inputLine);
                    }
                }
            }
            lines.add(line);
        }
    }
    wfTemp = new Path("./." + wfSource.getName());
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(wfTemp.toString()))) {
        for (String line : lines) {
            writer.write(line);
            writer.newLine();
        }
    }
    return wfTemp;
}
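Note how the local temp file name is derived: "./." plus wfSource.getName() yields a hidden file in the current working directory. A sketch with a made-up workflow location:

    Path wfSource = new Path("/user/wf/galaxy.ga");      // hypothetical HDFS path
    Path wfTemp = new Path("./." + wfSource.getName());  // "./.galaxy.ga"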
From source file:de.huberlin.wbi.hiway.common.Data.java
License:Apache License
private Data(Path localPath, String containerId) {
    this.localDirectory = localPath.getParent();
    this.fileName = localPath.getName();
    this.containerId = containerId;
}
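For illustration, a hypothetical path shows how getParent() and getName() split into the two fields stored by this constructor (the values below are assumptions, not taken from the Data class):

    Path localPath = new Path("/tmp/hiway/output.csv");
    localPath.getParent(); // "/tmp/hiway"  -> localDirectory
    localPath.getName();   // "output.csv"  -> fileName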