List of usage examples for org.apache.hadoop.fs.Path#getName()
public String getName()
Returns the final component of the path, i.e. the file or directory name without any parent directories.
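Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below; the class name and paths are purely illustrative) of what getName() returns:

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the last path component, with no scheme,
        // authority, or parent directories.
        Path p = new Path("hdfs://namenode:8020/data/alpha/part-00000.dat");
        System.out.println(p.getName());             // prints: part-00000.dat
        System.out.println(p.getParent().getName()); // prints: alpha
    }
}

This is the same call the examples below use to skip files whose names start with "_" (typically Hadoop bookkeeping files such as _SUCCESS or _logs) and to extract table or directory names from full HDFS paths.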
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.ConvertDocumentIDToID.java
License:Apache License
private void loadDocumentIndex(String documentIndexPath) throws IOException {
    if (documentIndex == null) {
        documentIndex = new HashMap<String, Integer>();
        Path p = new Path(documentIndexPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        int index = 0;
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = null;
                    while ((line = reader.readLine()) != null) {
                        documentIndex.put(line.trim(), index++);
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
        log.info("Loaded document index with size: " + documentIndex.size());
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.ConvertFeatureToID.java
License:Apache License
private void loadFeatureIndex(String featureIndexPath) throws IOException {
    if (featureIndex == null) {
        featureIndex = new HashMap<String, Integer>();
        Path p = new Path(featureIndexPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        int index = 0;
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = null;
                    while ((line = reader.readLine()) != null) {
                        featureIndex.put(line.trim(), index++);
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
        log.info("Loaded feature index with size: " + featureIndex.size());
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.storage.MahoutVectorStorage.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked", "finally" })
@Override
public void prepareToWrite(RecordWriter writer) throws IOException {
    if (dimensionPath != null) {
        Path p = new Path(dimensionPath);
        FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
        for (FileStatus status : fs.listStatus(p)) {
            Path currPath = status.getPath();
            if (!status.isDir() && !currPath.getName().startsWith("_")) {
                BufferedReader reader = null;
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
                    String line = reader.readLine();
                    this.dimensions = Integer.parseInt(line);
                } catch (NumberFormatException nfe) {
                    LOG.error("Unexpected input for dimensions", nfe);
                    throw new IOException();
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                    // TODO: SMELLY: Why loop if we always cancel after the first file?
                    break;
                }
            }
        }
    }
    this.writer = writer;
}
From source file:com.mozilla.hadoop.fs.SequenceFileDirectoryReader.java
License:Apache License
public SequenceFileDirectoryReader(Path inputPath) throws IOException {
    fs = FileSystem.get(inputPath.toUri(), conf);
    paths = new ArrayList<Path>();
    for (FileStatus status : fs.listStatus(inputPath)) {
        Path p = status.getPath();
        if (!status.isDir() && !p.getName().startsWith("_")) {
            paths.add(p);
        }
    }
    pathIter = paths.iterator();
}
From source file:com.mozilla.hadoop.fs.TextFileDirectoryReader.java
License:Apache License
public TextFileDirectoryReader(Path inputPath) throws IOException {
    fs = FileSystem.get(inputPath.toUri(), conf);
    paths = new ArrayList<Path>();
    for (FileStatus status : fs.listStatus(inputPath)) {
        Path p = status.getPath();
        if (!status.isDir() && !p.getName().startsWith("_")) {
            paths.add(p);
        }
    }
    pathIter = paths.iterator();
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/**
 * Opens a new part file.
 *
 * <p>
 * This closes the old bucket file and retrieves a new bucket path from the {@code Bucketer}.
 */
private void openNewPartFile(Path bucketPath, BucketState<T> bucketState) throws Exception {
    closeCurrentPartFile(bucketState);

    FileSystem fs = new Path(basePath).getFileSystem(hadoopConf);

    if (!fs.exists(bucketPath)) {
        try {
            if (fs.mkdirs(bucketPath)) {
                LOG.debug("Created new bucket directory: {}", bucketPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Could not create new bucket path.", e);
        }
    }

    Path partPath = new Path(bucketPath,
            partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix);

    // This should work since there is only one parallel subtask that tries names with
    // our subtask id. Otherwise we would run into concurrency issues here.
    while (fs.exists(partPath) || fs.exists(
            new Path(partPath.getParent(), pendingPrefix + partPath.getName()).suffix(pendingSuffix))) {
        bucketState.partCounter++;
        partPath = new Path(bucketPath,
                partPrefix + "-" + subtaskIndex + "-" + bucketState.partCounter + partSuffix);
    }

    // increase, so we don't have to check for this name next time
    bucketState.partCounter++;

    LOG.debug("Next part path is {}", partPath.toString());
    bucketState.currentFile = partPath.toString();

    Path inProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
            .suffix(inProgressSuffix);

    // If we don't already have a writer for this bucket, create one
    if (bucketState.writer == null) {
        bucketState.writer = writerTemplate.duplicate();
    }
    bucketState.writer.open(fs, inProgressPath);
    bucketState.isWriterOpen = true;
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
/**
 * Closes the current part file.
 *
 * <p>
 * This moves the current in-progress part file to a pending file and adds it to the list
 * of pending files in our bucket state.
 */
private void closeCurrentPartFile(BucketState<T> bucketState) throws Exception {
    if (bucketState.isWriterOpen) {
        bucketState.writer.close();
        bucketState.isWriterOpen = false;
    }

    if (bucketState.currentFile != null) {
        Path currentPartPath = new Path(bucketState.currentFile);
        Path inProgressPath = new Path(currentPartPath.getParent(),
                inProgressPrefix + currentPartPath.getName()).suffix(inProgressSuffix);
        Path pendingPath = new Path(currentPartPath.getParent(), pendingPrefix + currentPartPath.getName())
                .suffix(pendingSuffix);
        FileSystem fs = inProgressPath.getFileSystem(hadoopConf);
        fs.rename(inProgressPath, pendingPath);
        LOG.debug("Moving in-progress bucket {} to pending file {}", inProgressPath, pendingPath);
        bucketState.pendingFiles.add(currentPartPath.toString());
        bucketState.currentFile = null;
    }
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception {
    synchronized (state.bucketStates) {
        Iterator<Map.Entry<String, BucketState<T>>> it = state.bucketStates.entrySet().iterator();
        while (it.hasNext()) {
            BucketState<T> bucketState = it.next().getValue();
            synchronized (bucketState.pendingFilesPerCheckpoint) {
                Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
                Set<Long> checkpointsToRemove = new HashSet<>();
                for (Long pastCheckpointId : pastCheckpointIds) {
                    if (pastCheckpointId <= checkpointId) {
                        LOG.debug("Moving pending files to final location for checkpoint {}",
                                pastCheckpointId);
                        // All the pending files are buckets that have been completed but are waiting
                        // to be renamed to their final name
                        for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                            Path finalPath = new Path(filename);
                            Path pendingPath = new Path(finalPath.getParent(),
                                    pendingPrefix + finalPath.getName()).suffix(pendingSuffix);

                            FileSystem fs = pendingPath.getFileSystem(hadoopConf);
                            fs.rename(pendingPath, finalPath);
                            LOG.debug(
                                    "Moving pending file {} to final location having completed checkpoint {}.",
                                    pendingPath, pastCheckpointId);
                        }
                        checkpointsToRemove.add(pastCheckpointId);
                    }
                }
                if (!bucketState.isWriterOpen && bucketState.pendingFiles.isEmpty()) {
                    // We've dealt with all the pending files and the writer for this bucket is not
                    // currently open. Therefore this bucket is currently inactive and we can remove
                    // it from our state.
                    it.remove();
                } else {
                    for (Long toRemove : checkpointsToRemove) {
                        bucketState.pendingFilesPerCheckpoint.remove(toRemove);
                    }
                }
            }
        }
    }
}
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug(
                            "In-progress file {} has been moved to pending after checkpoint, moving to final location.",
                            partPath);
                    // has been moved to pending in the mean time, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.",
                            bucketState.currentFile, partPath);
                } else {
                    LOG.debug(
                            "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                                    + "it was moved to final location by a previous snapshot restore",
                            bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try, we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath,
                            bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }
                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Could not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure
        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting
            // to be renamed to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);
                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug(
                                "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e);
                }
            }
        }
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();

    // delete pending files
    try {
        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);
        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}
From source file:com.mvdb.platform.action.VersionMerge.java
License:Apache License
public static void main(String[] args) throws Exception {
    logger.error("error1");
    logger.warn("warning1");
    logger.info("info1");
    logger.debug("debug1");
    logger.trace("trace1");
    ActionUtils.setUpInitFileProperty();
    // LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory();
    // StatusPrinter.print(lc);

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // Also add lastMergedTimeStamp and mergeUptoTimestamp and passive db name which would be mv1 or mv2
    if (otherArgs.length != 3) {
        System.err.println("Usage: versionmerge <customer-directory>");
        System.exit(2);
    }
    // Example: file:/home/umesh/.mvdb/etl/data/alpha
    // Example: hdfs://localhost:9000/data/alpha
    String customerDirectory = otherArgs[0];
    String lastMergedDirName = otherArgs[1];
    String lastCopiedDirName = otherArgs[2];

    org.apache.hadoop.conf.Configuration conf1 = new org.apache.hadoop.conf.Configuration();
    // conf1.addResource(new Path("/home/umesh/ops/hadoop-1.2.0/conf/core-site.xml"));
    FileSystem hdfsFileSystem = FileSystem.get(conf1);

    Path topPath = new Path(customerDirectory);

    // Clean scratch db
    Path passiveDbPath = new Path(topPath, "db/mv1");
    Path tempDbPath = new Path(topPath, "db/tmp-" + (int) (Math.random() * 100000));
    if (hdfsFileSystem.exists(tempDbPath)) {
        boolean success = hdfsFileSystem.delete(tempDbPath, true);
        if (success == false) {
            System.err.println(String.format("Unable to delete temp directory %s", tempDbPath.toString()));
            System.exit(1);
        }
    }

    // last three parameters are hardcoded and the nulls must be replaced later after changing input parameters.
    Path[] inputPaths = getInputPaths(hdfsFileSystem, topPath, lastMergedDirName, lastCopiedDirName, null);
    Set<String> tableNameSet = new HashSet<String>();
    for (Path path : inputPaths) {
        tableNameSet.add(path.getName());
    }

    Job job = new Job(conf, "versionmerge");
    job.setJarByClass(VersionMerge.class);
    job.setMapperClass(VersionMergeMapper.class);
    job.setReducerClass(VersionMergeReducer.class);
    job.setMapOutputKeyClass(MergeKey.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    String lastDirName = null;
    if (inputPaths != null && inputPaths.length > 1) {
        lastDirName = inputPaths[(inputPaths.length) - 2].getParent().getName();
    }
    for (Path inputPath : inputPaths) {
        FileInputFormat.addInputPath(job, inputPath);
    }
    FileOutputFormat.setOutputPath(job, tempDbPath);

    for (String table : tableNameSet) {
        if (table.endsWith(".dat") == false) {
            continue;
        }
        table = table.replaceAll("-", "");
        table = table.replaceAll(".dat", "");
        MultipleOutputs.addNamedOutput(job, table, SequenceFileOutputFormat.class, Text.class,
                BytesWritable.class);
    }

    boolean success = job.waitForCompletion(true);
    System.out.println("Success:" + success);
    System.out.println(ManagementFactory.getRuntimeMXBean().getName());
    if (success && lastDirName != null) {
        ActionUtils.setConfigurationValue(new Path(customerDirectory).getName(),
                ConfigurationKeys.LAST_MERGE_TO_MVDB_DIRNAME, lastDirName);
    }
    // hdfsFileSystem.delete(passiveDbPath, true);
    // hdfsFileSystem.rename(tempDbPath, passiveDbPath);
    System.exit(success ? 0 : 1);
}