Example usage for org.apache.hadoop.fs Path getName

List of usage examples for org.apache.hadoop.fs Path getName

Introduction

On this page you can find example usages of org.apache.hadoop.fs Path getName.

Prototype

public String getName() 

Document

Returns the final component of this path.
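
For orientation, here is a minimal, self-contained sketch of what getName() returns; the HDFS path used below is made up for illustration.

import org.apache.hadoop.fs.Path;

public class PathGetNameExample {
    public static void main(String[] args) {
        // getName() returns only the final component of the path,
        // without the scheme, authority or parent directories.
        Path p = new Path("hdfs://namenode:8020/user/data/input/part-00000");
        System.out.println(p.getName());             // prints "part-00000"
        System.out.println(p.getParent().getName()); // prints "input"
    }
}

Most of the examples below rely on exactly this behaviour, typically comparing the final component against marker file names such as "_SUCCESS" or prefixes such as "part-".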

Usage

From source file:com.ddp.SimpleREST.java

License:Open Source License

private Map<String, FileUpload> getUploadedFiles(RoutingContext ctx) {
    // any number of uploads
    Map<String, FileUpload> files = new HashMap<>();
    for (FileUpload f : ctx.fileUploads()) {
        // do whatever you need to do with the file (it is already saved
        // in the directory you wanted...)

        try {
            Path p = new Path(f.uploadedFileName());
            fs.copyFromLocalFile(p, new Path(hdfsUploadHome));
            files.put(hdfsUploadHome + "/" + p.getName(), f);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return files;

}

From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java

License:Apache License

private void generateXMLdocs(String inputf, String outputf) throws IOException {
    Path input = new Path(inputf);

    File output = new File(outputf);
    if (output.exists() && output.isFile()) {
        System.err.println("Output " + outputf + " already exists");
        return;
    }
    if (output.exists() == false)
        output.mkdirs();

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateXMLdocs(suPath, output, count);
    }
}
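
Several examples on this page, including the one above, use getName() to skip Hadoop's job bookkeeping files such as _SUCCESS while iterating over a job output directory. A minimal sketch of that recurring pattern, with a hypothetical input directory:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SkipSuccessMarker {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path("/user/data/job-output"); // hypothetical directory
        FileSystem fs = input.getFileSystem(conf);
        for (FileStatus status : fs.listStatus(input)) {
            Path p = status.getPath();
            // getName() looks only at the final path component, so the
            // comparison is independent of the parent directory.
            if (p.getName().equals("_SUCCESS"))
                continue;
            System.out.println("processing " + p);
        }
    }
}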

From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("v", "vector", true, "input vector sequencefile");
    options.addOption("l", "label", true, "input vector sequencefile");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusGenerator", options);
            return 0;
        }
        if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) {
            formatter.printHelp("CorpusGenerator", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusGenerator", options);
        // return here; otherwise the later use of "line" would throw a NullPointerException
        return -1;
    }

    Path vectorPath = new Path(line.getOptionValue("v"));
    Path labelPath = new Path(line.getOptionValue("l"));
    String output = line.getOptionValue("o");

    Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

    // extracts the string representations from the vectors
    int retVal = vectorToString(vectorPath, tempOutput);
    if (retVal != 0) {
        HadoopUtil.delete(getConf(), tempOutput);
        return retVal;
    }

    Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis());

    retVal = convert(tempOutput, labelPath, tempOutput2);

    // delete the temp output
    HadoopUtil.delete(getConf(), tempOutput);

    if (retVal != 0) {
        HadoopUtil.delete(getConf(), tempOutput2);
        return retVal;
    }

    // convert tempOutput to standard file
    BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output)));

    // the label dictionary is not dumped to text
    int labelMaxIndex = 0;
    Map<String, Integer> labelIndex = new HashMap<String, Integer>();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] fss = fs.listStatus(tempOutput2);
    try {
        for (FileStatus status : fss) {
            Path path = status.getPath();
            // skips the _log or _SUCCESS files
            if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            // read the key + values in that file
            Text key = new Text();
            Text value = new Text();
            while (reader.next(key, value)) {
                String label = key.toString();
                // replace the label by its index
                Integer indexLabel = labelIndex.get(label);
                if (indexLabel == null) {
                    indexLabel = new Integer(labelMaxIndex);
                    labelIndex.put(label, indexLabel);
                    labelMaxIndex++;
                }
                String val = value.toString();
                bow.append(indexLabel.toString()).append(val).append("\n");
            }
            reader.close();
        }
        bow.flush();
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        bow.close();
        fs.delete(tempOutput2, true);
    }
    return 0;
}

From source file:com.digitalpebble.behemoth.util.ContentExtractor.java

License:Apache License

private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {

    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);

    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());

    if (fsout.exists(dirPath) == false)
        fsout.mkdirs(dirPath);
    else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }

    // index file
    Path indexPath = new Path(dirPath, "index");
    if (fsout.exists(indexPath) == false) {
        fsout.createNewFile(indexPath);
    }

    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);

    index = fsout.create(indexPath);

    createArchive(dirPath);

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }

    if (index != null)
        index.close();

    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.CorpusReader.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("c", "displayContent", false, "display binary content in output");
    options.addOption("t", "displayText", false, "display text in output");
    options.addOption("a", "displayAnnotations", false, "display annotations in output");
    options.addOption("m", "displayMetadata", false, "display metadata in output");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusReader", options);
            return 0;
        }
        if (input == null) {
            formatter.printHelp("CorpusReader", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusReader", options);
        return -1;
    }

    boolean showBinaryContent = line.hasOption("displayContent");
    boolean showText = line.hasOption("displayText");
    boolean showAnnotations = line.hasOption("displayAnnotations");
    boolean showMD = line.hasOption("displayMetadata");

    Path inputPath = new Path(line.getOptionValue("i"));

    Configuration conf = getConf();
    FileSystem fs = inputPath.getFileSystem(conf);

    // filter input
    DocumentFilter filters = DocumentFilter.getFilters(conf);
    boolean doFilter = DocumentFilter.isRequired(conf);

    FileStatus[] fss = fs.listStatus(inputPath);
    for (FileStatus status : fss) {
        Path path = status.getPath();
        // skips the _log or _SUCCESS files
        if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName()))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Text key = new Text();
        BehemothDocument value = new BehemothDocument();
        while (reader.next(key, value)) {
            // skip this document?
            if (doFilter && filters.keep(value) == false)
                continue;

            System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD));
        }
        reader.close();
    }

    return 0;
}

From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java

License:Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java

License:Open Source License

public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

/**
 * Select the <code>columns</code> from the <code>dataset</code> and store
 * the result in <code>outputFolder</code> with the given <code>outputFormat</code>.
 * <p>
 * 
 * Here is an example:
 * <pre>
 * <code>
 * public MyJob extends MobiusJob
 * {
 *    public void run(String[] args)
 *    {
 *       Dataset students = ...;
 *       
 *       // save the result to $OUTPUT in SequenceFileOutputFormat,
 *       // the key will be NullWritable, and the value is a Tuple 
 *       // which contains 3 columns, id, f_name and l_name.
 *       this.list(students,
 *          new Path("$OUTPUT"),
 *          SequenceFileOutputFormat.class,
 *          new Column(students, "id"),
 *          new Column(students, "f_name"),
 *          new Column(students, "l_name")
 *       ); 
 *    }
 *    
 *    public static void main(String[] args) throws Exception
 *    {
 *       System.exit(MobiusJobRunner.run(new MyJob(), args));
 *    }
 * }
 * </code>
 * </pre>
 */
public Dataset list(Dataset dataset, Path outputFolder, Class<? extends FileOutputFormat> outputFormat,
        Column... columns) throws IOException {
    byte datasetID = 0;// set to 0 as there is only one dataset to be operated.

    JobConf job = dataset.createJobConf(datasetID);

    job.set("mapred.job.name", "Listing " + dataset.getName());
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(0); // list is map only job
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Tuple.class);
    job.setJobName("List " + dataset.getName());

    JobSetup.validateColumns(dataset, columns);
    JobSetup.setupInputs(job, dataset, datasetID);
    JobSetup.setupProjections(job, dataset, datasetID, columns);
    JobSetup.setupOutputs(job, outputFolder, outputFormat);

    this.addToExecQueue(job);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(this).getBuilder(outputFormat,
            "Dataset_" + outputFolder.getName());
    return builder.buildFromPreviousJob(job, outputFormat, Column.toSchemaArray(columns));
}

From source file:com.ebay.erl.mobius.core.Persistable.java

License:Apache License

/**
 * Save the dataset and store the <code>projections</code>
 * into the specified <code>output</code> path in the
 * format of the given <code>outputFormat</code>.
 * <p>
 * 
 * Only the rows that meet the <code>criteria</code> will be 
 * stored.  The <code>criteria</code> can only evaluate the 
 * columns specified in the <code>projections</code>.
 * <p>
 * 
 * <code>output</code> will be deleted before the job gets started.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
        TupleCriterion criteria, Projectable... projections) throws IOException {
    if (projections == null || projections.length == 0)
        throw new IllegalArgumentException("Please specify the output columns.");

    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but in different dataset, as we are going the use 
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Projectable aColumn : projections) {
        String[] outputSchema = aColumn.getOutputSchema();
        for (String anOutput : outputSchema) {
            if (!columnNames.contains(anOutput)) {
                columnNames.add(anOutput);
            } else {
                throw new IllegalArgumentException(columnNames + " from " + aColumn.toString()
                        + " is ambiguous; it has the same name "
                        + "as another selected projection in a different dataset. Please use Column#setNewName(String) to "
                        + "change it.");
            }
        }
    }

    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if (criteria != null) {
        TupleCriterion.validate(columnNames, criteria);
        this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }

    // setup {@link Dataset} to {@link Column} mapping so we can setup projection columns
    // for each dataset, and also perform validation on making sure all the projection columns 
    // are from the selected <code>datasets</code> only,
    Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

    for (Projectable aFunc : projections) {
        Column[] requiredInputColumns = aFunc.getInputColumns();
        for (Column aColumn : requiredInputColumns) {
            Dataset aDataset = aColumn.getDataset();
            // make sure the <code>aDataset</code> within the participated datasets
            boolean withinSelectedDataset = false;
            for (Dataset aSelectedDataset : this.datasets) {
                if (aSelectedDataset.equals(aDataset)) {
                    withinSelectedDataset = true;
                    break;
                }
            }

            if (!withinSelectedDataset) {
                // the user selected a column from a dataset that is not
                // among the selected datasets in this join/group-by job.
                throw new IllegalArgumentException(aColumn.toString()
                        + " is not within the selected datasets "
                        + "in this join/group task; please select columns only from the selected datasets.");
            }

            List<Column> projectablesInADataset = null;
            if ((projectablesInADataset = datasetToColumns.get(aDataset)) == null) {
                projectablesInADataset = new LinkedList<Column>();
                datasetToColumns.put(aDataset, projectablesInADataset);
            }

            if (!projectablesInADataset.contains(aColumn))
                projectablesInADataset.add(aColumn);
        }
    }

    if (datasetToColumns.keySet().size() != this.datasets.length) {
        throw new IllegalArgumentException(
                "Please select at least one column from each dataset in the join/group-by job.");
    }

    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    // setup input paths, projection columns for each datasets.
    for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
        Dataset aDataset = this.datasets[assignedDatasetID];

        // setup input for each dataset
        JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);

        // setup projection for each dataset
        JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID,
                datasetToColumns.get(aDataset).toArray(new Column[0]));
    }

    // setup all dataset IDs
    for (int i = 0; i < this.datasets.length; i++) {
        Byte id = this.datasets[i].getID();
        if (!this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty()) {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS,
                    this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS) + "," + id);
        } else {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
        }
    }

    boolean isCombinable = true;
    for (Projectable aFunc : projections) {
        aFunc.setConf(jobConf);

        if (!aFunc.isCombinable()) {
            isCombinable = false;
            LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() returned false.");
            break;
        }
        if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
            LOGGER.info(aFunc.toString()
                    + " is a group function and use group key as its input only, disable combiner.");
            isCombinable = false;
            break;
        }
    }

    LOGGER.info("Using Combiner? " + isCombinable);
    if (isCombinable) {
        jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }

    job.addToExecQueue(jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output column from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}

From source file:com.ebay.erl.mobius.core.SortPersistable.java

License:Apache License

/**
 * Save the sort result to the given <code>output</code> with
 * the specified <code>outputFormat</code>.
 * <p>
 * 
 * The returned {@link Dataset} represents the sorted result,
 * it can be used to do further analysis.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat)
        throws IOException {
    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(TotalSortReducer.class);

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    job.addToExecQueue(this.jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output column from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}