List of usage examples for org.apache.hadoop.fs Path getName
public String getName()
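Path#getName() returns the final component of the path, i.e. everything after the last "/" separator; the directory part and any scheme or authority are dropped. A minimal sketch of that behaviour (the host, directory and file names below are made up for illustration, not taken from the examples that follow):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() keeps only the last path component
        Path p = new Path("hdfs://namenode:8020/user/alice/data/part-00000");
        System.out.println(p.getName());                      // prints "part-00000"
        System.out.println(new Path("/tmp/logs").getName());  // prints "logs"
    }
}

The examples below use getName() mostly to skip bookkeeping files such as "_SUCCESS", to match "part-*" output files, or to build a destination name from an uploaded file's local path.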
From source file:com.ddp.SimpleREST.java
License:Open Source License
private Map<String, FileUpload> getUploadedFiles(RoutingContext ctx) {
    // any number of uploads
    Map<String, FileUpload> files = new HashMap<>();
    for (FileUpload f : ctx.fileUploads()) {
        // do whatever you need to do with the file (it is already saved
        // on the directory you wanted...)
        try {
            Path p = new Path(f.uploadedFileName());
            fs.copyFromLocalFile(p, new Path(hdfsUploadHome));
            files.put(hdfsUploadHome + "/" + p.getName(), f);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return files;
}
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(String inputf, String outputf) throws IOException {
    Path input = new Path(inputf);
    File output = new File(outputf);
    if (output.exists() && output.isFile()) {
        System.err.println("Output " + outputf + " already exists");
        return;
    }
    if (output.exists() == false)
        output.mkdirs();

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateXMLdocs(suPath, output, count);
    }
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("v", "vector", true, "input vector sequencefile"); options.addOption("l", "label", true, "input vector sequencefile"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;// w w w . ja v a 2s .com try { line = parser.parse(options, args); if (line.hasOption("help")) { formatter.printHelp("CorpusGenerator", options); return 0; } if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) { formatter.printHelp("CorpusGenerator", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusGenerator", options); } Path vectorPath = new Path(line.getOptionValue("v")); Path labelPath = new Path(line.getOptionValue("l")); String output = line.getOptionValue("o"); Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis()); // extracts the string representations from the vectors int retVal = vectorToString(vectorPath, tempOutput); if (retVal != 0) { HadoopUtil.delete(getConf(), tempOutput); return retVal; } Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis()); retVal = convert(tempOutput, labelPath, tempOutput2); // delete the temp output HadoopUtil.delete(getConf(), tempOutput); if (retVal != 0) { HadoopUtil.delete(getConf(), tempOutput2); return retVal; } // convert tempOutput to standard file BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output))); // the label dictionary is not dumped to text int labelMaxIndex = 0; Map<String, Integer> labelIndex = new HashMap<String, Integer>(); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); FileStatus[] fss = fs.listStatus(tempOutput2); try { for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // read the key + values in that file Text key = new Text(); Text value = new Text(); while (reader.next(key, value)) { String label = key.toString(); // replace the label by its index Integer indexLabel = labelIndex.get(label); if (indexLabel == null) { indexLabel = new Integer(labelMaxIndex); labelIndex.put(label, indexLabel); labelMaxIndex++; } String val = value.toString(); bow.append(indexLabel.toString()).append(val).append("\n"); } reader.close(); } bow.flush(); } catch (Exception e) { e.printStackTrace(); return -1; } finally { bow.close(); fs.delete(tempOutput2, true); } return 0; }
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {
    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);

    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());

    if (fsout.exists(dirPath) == false)
        fsout.mkdirs(dirPath);
    else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }

    // index file
    Path indexPath = new Path(dirPath, "index");
    if (fsout.exists(indexPath) == false) {
        fsout.createNewFile(indexPath);
    }

    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);

    index = fsout.create(indexPath);

    createArchive(dirPath);

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }

    if (index != null)
        index.close();

    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusReader.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("c", "displayContent", false, "display binary content in output"); options.addOption("t", "displayText", false, "display text in output"); options.addOption("a", "displayAnnotations", false, "display annotations in output"); options.addOption("m", "displayMetadata", false, "display metadata in output"); // parse the command line arguments CommandLine line = null;//from ww w.ja v a2 s . c om try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusReader", options); return 0; } if (input == null) { formatter.printHelp("CorpusReader", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusReader", options); return -1; } boolean showBinaryContent = line.hasOption("displayContent"); boolean showText = line.hasOption("displayText"); boolean showAnnotations = line.hasOption("displayAnnotations"); boolean showMD = line.hasOption("displayMetadata"); Path inputPath = new Path(line.getOptionValue("i")); Configuration conf = getConf(); FileSystem fs = inputPath.getFileSystem(conf); // filter input DocumentFilter filters = DocumentFilter.getFilters(conf); boolean doFilter = DocumentFilter.isRequired(conf); FileStatus[] fss = fs.listStatus(inputPath); for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); Text key = new Text(); BehemothDocument value = new BehemothDocument(); while (reader.next(key, value)) { // skip this document? if (doFilter && filters.keep(value) == false) continue; System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD)); } reader.close(); } return 0; }
From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);
    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));
        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    pos = start;
}
From source file:com.ebay.erl.mobius.core.MobiusJob.java
License:Apache License
/**
 * Select the <code>columns</code> from the <code>dataset</code> and store
 * them into <code>outputFolder</code> with the given <code>outputFormat</code>.
 * <p>
 *
 * Here is an example:
 * <pre>
 * <code>
 * public class MyJob extends MobiusJob
 * {
 *     public void run(String[] args)
 *     {
 *         Dataset students = ...;
 *
 *         // save the result to $OUTPUT in SequenceFileOutputFormat,
 *         // the key will be NullWritable, and the value is a Tuple
 *         // which contains 3 columns: id, f_name and l_name.
 *         this.list(students,
 *                 new Path("$OUTPUT"),
 *                 SequenceFileOutputFormat.class,
 *                 new Column(students, "id"),
 *                 new Column(students, "f_name"),
 *                 new Column(students, "l_name")
 *         );
 *     }
 *
 *     public static void main(String[] args) throws Exception
 *     {
 *         System.exit(MobiusJobRunner.run(new MyJob(), args));
 *     }
 * }
 * </code>
 * </pre>
 */
public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
        Column... columns) throws IOException {
    byte datasetID = 0; // set to 0 as there is only one dataset to be operated on.

    JobConf job = dataset.createJobConf(datasetID);
    job.set("mapred.job.name", "Listing " + dataset.getName());
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(0); // list is a map-only job
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Tuple.class);
    job.setJobName("List " + dataset.getName());

    JobSetup.validateColumns(dataset, columns);
    JobSetup.setupInputs(job, dataset, datasetID);
    JobSetup.setupProjections(job, dataset, datasetID, columns);
    JobSetup.setupOutputs(job, outputFolder, outputFormat);

    this.addToExecQueue(job);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
            "Dataset_" + outputFolder.getName());
    return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
}
From source file:com.ebay.erl.mobius.core.Persistable.java
License:Apache License
/**
 * Save the dataset and store the <code>projections</code>
 * into the specified <code>output</code> path in the
 * format of the given <code>outputFormat</code>.
 * <p>
 *
 * Only the rows that meet the <code>criteria</code> will be
 * stored. The <code>criteria</code> can only evaluate the
 * columns specified in the <code>projections</code>.
 * <p>
 *
 * <code>output</code> will be deleted before the job gets started.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
        TupleCriterion criteria, Projectable... projections) throws IOException {
    if (projections == null || projections.length == 0)
        throw new IllegalArgumentException("Please specify the output columns.");

    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but are in different datasets, as we are going to use
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Projectable aColumn : projections) {
        String[] outputSchema = aColumn.getOutputSchema();
        for (String anOutput : outputSchema) {
            if (!columnNames.contains(anOutput)) {
                columnNames.add(anOutput);
            } else {
                throw new IllegalArgumentException(columnNames + " from " + aColumn.toString()
                        + " is ambiguous, it has the same name as another selected projection in a "
                        + "different dataset, please use Column#setNewName(String) to change it.");
            }
        }
    }

    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if (criteria != null) {
        TupleCriterion.validate(columnNames, criteria);
        this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }

    // setup {@link Dataset} to {@link Column} mapping so we can setup projection columns
    // for each dataset, and also perform validation on making sure all the projection columns
    // are from the selected <code>datasets</code> only.
    Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

    for (Projectable aFunc : projections) {
        Column[] requiredInputColumns = aFunc.getInputColumns();
        for (Column aColumn : requiredInputColumns) {
            Dataset aDataset = aColumn.getDataset();
            // make sure <code>aDataset</code> is within the participating datasets
            boolean withinSelectedDataset = false;
            for (Dataset aSelectedDataset : this.datasets) {
                if (aSelectedDataset.equals(aDataset)) {
                    withinSelectedDataset = true;
                    break;
                }
            }

            if (!withinSelectedDataset) {
                // the user selected a column from a dataset that is not among
                // the selected datasets in this join/group-by job.
                throw new IllegalArgumentException(aColumn.toString()
                        + " is not within the selected datasets in this join/group task, "
                        + "please select columns only from the selected datasets.");
            }

            List<Column> projectablesInADataset = null;
            if ((projectablesInADataset = datasetToColumns.get(aDataset)) == null) {
                projectablesInADataset = new LinkedList<Column>();
                datasetToColumns.put(aDataset, projectablesInADataset);
            }

            if (!projectablesInADataset.contains(aColumn))
                projectablesInADataset.add(aColumn);
        }
    }

    if (datasetToColumns.keySet().size() != this.datasets.length) {
        throw new IllegalArgumentException(
                "Please select at least one column from each dataset in the join/group-by job.");
    }

    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    // setup input paths and projection columns for each dataset.
    for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
        Dataset aDataset = this.datasets[assignedDatasetID];

        // setup input for each dataset
        JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);

        // setup projection for each dataset
        JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID,
                datasetToColumns.get(aDataset).toArray(new Column[0]));
    }

    // setup all dataset IDs
    for (int i = 0; i < this.datasets.length; i++) {
        Byte id = this.datasets[i].getID();
        if (!this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty()) {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS,
                    this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS) + "," + id);
        } else {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
        }
    }

    boolean isCombinable = true;
    for (Projectable aFunc : projections) {
        aFunc.setConf(jobConf);
        if (!aFunc.isCombinable()) {
            isCombinable = false;
            LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() returned false.");
            break;
        }
        if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
            LOGGER.info(aFunc.toString()
                    + " is a group function and uses the group key as its only input, disabling combiner.");
            isCombinable = false;
            break;
        }
    }

    LOGGER.info("Using Combiner? " + isCombinable);
    if (isCombinable) {
        jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }

    job.addToExecQueue(jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}
From source file:com.ebay.erl.mobius.core.SortPersistable.java
License:Apache License
/**
 * Save the sort result to the given <code>output</code> with
 * the specified <code>outputFormat</code>.
 * <p>
 *
 * The returned {@link Dataset} represents the sorted result,
 * it can be used to do further analysis.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat)
        throws IOException {
    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(TotalSortReducer.class);

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    job.addToExecQueue(this.jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }
    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}