Example usage for org.apache.hadoop.fs Path getName

List of usage examples for org.apache.hadoop.fs Path getName

Introduction

On this page you can find example usages of org.apache.hadoop.fs Path getName.

Prototype

public String getName() 

Document

Returns the final component of this path.
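As a minimal sketch of the method's behavior (the file names and cluster URI below are invented for illustration, not taken from the examples that follow), this snippet prints the final component of a file path and of a directory path:

import org.apache.hadoop.fs.Path;

public class PathGetNameExample {
    public static void main(String[] args) {
        // getName() returns only the final path component, without parent directories or scheme.
        Path file = new Path("hdfs://namenode:8020/user/alice/data/part-00000.gz");
        System.out.println(file.getName()); // prints "part-00000.gz"

        Path dir = new Path("/user/alice/data");
        System.out.println(dir.getName()); // prints "data"
    }
}

The usage examples below show the same call inside real Hadoop jobs, for instance to filter files by suffix, to match cached file names, or to build a local destination path.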

Usage

From source file:de.huberlin.wbi.hiway.scheduler.Scheduler.java

License:Apache License

protected void parseLogs() {
    String hdfsBaseDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE_DEFAULT);
    String hdfsSandboxDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE_DEFAULT);
    Path hdfsBaseDirectory = new Path(new Path(hdfs.getUri()), hdfsBaseDirectoryName);
    Path hdfsSandboxDirectory = new Path(hdfsBaseDirectory, hdfsSandboxDirectoryName);
    try {
        for (FileStatus appDirStatus : hdfs.listStatus(hdfsSandboxDirectory)) {
            if (appDirStatus.isDirectory()) {
                Path appDir = appDirStatus.getPath();
                for (FileStatus srcStatus : hdfs.listStatus(appDir)) {
                    Path src = srcStatus.getPath();
                    String srcName = src.getName();
                    if (srcName.endsWith(".log")) {
                        // local destination file named after the application's directory
                        Path dest = new Path(appDir.getName());
                        System.out.println("Parsing log " + dest.toString());
                        hdfs.copyToLocalFile(false, src, dest);

                        try (BufferedReader reader = new BufferedReader(
                                new FileReader(new File(dest.toString())))) {
                            String line;
                            while ((line = reader.readLine()) != null) {
                                JsonReportEntry entry = new JsonReportEntry(line);
                                addEntryToDB(entry);
                            }
                        }
                    }
                }
            }
        }
    } catch (IOException | JSONException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/** 
 * Reads the file paths listed in this split, one per line, into a list.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException 
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM(Byte Order Mark)
            // Text only support UTF-8, we only need to check UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // find UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        // stop at end of stream; readLine() returns 0 bytes once the input is exhausted
        if (newSize == 0) {
            break;
        }

        paths.add(nextLine.toString());
        LOG.info("Read line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

From source file:de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.java

License:Apache License

@Override
public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
    try {
        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
            String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
            for (Path p : DistributedCache.getLocalCacheFiles(conf))
                if (p.getName().contains(extractorConfigurationFileName))
                    extractorConfigurationFilesArr[i] = p.toString();
        }

        int maxlength = conf.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1);

        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                .createGoogleSyntacticsRelationEngine(true/* create_tokens */, true/* create_sentences */,
                        true/* create_dependencies */, true/* create_new_relations */,
                        true/* create_dependency_path */, false/*ignore_nn_relations*/,
                        maxlength/* dependecy_path_maxlength */, false/* create_detailed_output */,
                        extractorConfigurationFilesArr/* extractor_configuration */,
                        SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME/* output_destination */)));
        return builder.createAggregateDescription();

    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }

}

From source file:de.tudarmstadt.lt.n2n.hadoop.PreparsedJob.java

License:Apache License

@Override
public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
    try {
        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
            String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
            for (Path p : DistributedCache.getLocalCacheFiles(conf))
                if (p.getName().contains(extractorConfigurationFileName))
                    extractorConfigurationFilesArr[i] = p.toString();
        }

        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                .createPreparsedRelationEngine(true/* create_tokens */, true/* create_sentences */,
                        true/* create_dependencies */, true/* create_new_relations */,
                        true/* create_dependency_path */, false/*ignore_nn_relations*/,
                        5/* dependecy_path_maxlength */, false/* create_detailed_output */,
                        extractorConfigurationFilesArr/* extractor_configuration */,
                        SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME/* output_destination */)));
        return builder.createAggregateDescription();

    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }

}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java

License:Apache License

/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 *
 * @param conf     The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}

From source file:dz.lab.hdfs.LsWithPathFilter.java

/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path("/");
    // restrict result of listStatus() by supplying PathFilter    
    FileStatus[] files = fs.listStatus(path, new PathFilter() {

        @Override
        public boolean accept(Path path) {
            // do not show paths whose final component equals "user"
            if (path.getName().equals("user")) {
                return false;
            }
            return true;
        }
    });

    for (FileStatus file : files) {
        System.out.println(file.getPath().getName());
    }
}

From source file:eagle.dataproc.impl.storm.hdfs.UserProfileGenerationHDFSSpout.java

License:Apache License

public void copyFiles() {
    LOG.info("Inside listFiles()");
    //Configuration conf = new Configuration();
    JobConf conf = new JobConf();
    // _____________ TO TEST THAT CORRECT HADOOP JARs ARE INCLUDED __________________
    ClassLoader cl = ClassLoader.getSystemClassLoader();
    URL[] urls = ((URLClassLoader) cl).getURLs();
    if (LOG.isDebugEnabled()) {
        for (URL url : urls) {
            LOG.debug(url.getFile());
        }
    }
    // _________________________________________
    String hdfsConnectionStr = configContext.getString("dataSourceConfig.hdfsConnection");
    LOG.info("HDFS connection string: " + hdfsConnectionStr);

    String hdfsPath = configContext.getString("dataSourceConfig.hdfsPath");
    LOG.info("HDFS path: " + hdfsPath);

    String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
    LOG.info("copyToPath: " + copyToPath);
    String srcPathStr = "hdfs://" + hdfsConnectionStr + hdfsPath;
    Path srcPath = new Path(srcPathStr);
    LOG.info("listFiles called");
    LOG.info("srcPath: " + srcPath);
    try {
        FileSystem fs = srcPath.getFileSystem(conf);
        /*CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); 
        CompressionCodec codec = codecFactory.getCodec(srcPath);
        DataInputStream inputStream = new DataInputStream(codec.createInputStream(fs.open(srcPath)));
        */

        Path destPath = new Path(copyToPath);
        LOG.info("Destination path: " + destPath);
        String userListFileName = configContext.getString("dataSourceConfig.userList");
        //loggerHDFSSpout.info("userListFileName: " + userListFileName);
        List<String> userList = getUser(userListFileName);
        for (String user : userList) {
            Path finalSrcPath = new Path(srcPath.getName() + "/" + user);
            fs.copyToLocalFile(finalSrcPath, destPath);
        }
        LOG.info("Copy to local succeed");
        fs.close();

    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

}

From source file:edu.american.student.mnemosyne.core.IngestProcess.java

License:Apache License

/**
 * For every path, grab every line of the file and throw it into Accumulo.
 */
@SuppressWarnings("static-access")
public void process() throws ProcessException {
    for (Path path : pathsToProcess) {
        uuid = UUID.randomUUID().toString();
        this.fileName = path.getName();
        HadoopForeman hForeman = new HadoopForeman();
        HadoopJobConfiguration conf = new HadoopJobConfiguration();

        conf.setJarClass(this.getClass());
        conf.setMapperClass(IngestMapper.class);
        conf.setInputFormatClass(TextInputFormat.class);
        conf.overridePathToProcess(path);
        conf.setOutputFormatClass(NullOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        hForeman.runJob(conf);

    }

}

From source file:edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java

License:Open Source License

public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
    setClassName("HoopWholeFileRecordReader");
    debug("HoopWholeFileRecordReader ()");

    job = aJob;

    FileSplit split = (FileSplit) aSplit;

    //this.maxLineLength=job.getInt ("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);

    fileSize = split.getLength();

    final Path file = split.getPath();

    createKeyFromName(file.getName());

    debug("File/Key: " + internalKey + " with size: " + split.getLength());

    compressionCodecs = new CompressionCodecFactory(job);

    final CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = null;
    try {
        fs = file.getFileSystem(job);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    FSDataInputStream fileIn = null;

    try {
        fileIn = fs.open(split.getPath());
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    if (codec != null) {
        try {
            inStream = codec.createInputStream(fileIn);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    } else {

        inStream = fileIn;
    }
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, int maxIterations, String delta) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();

    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }
    boolean converged = false;
    int iteration = 1;
    while (!converged && iteration <= maxIterations) {
        log.info("K-Means Iteration: {}", iteration);
        FileSystem fs = FileSystem.get(input.toUri(), conf);
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
                PathFilters.logsCRCFilter(), conf)) {
            clusterer.addPointToNearestCluster(value.get(), clusters);
        }
        converged = clusterer.testConvergence(clusters, Double.parseDouble(delta));
        Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersOut, "part-r-00000"),
                Text.class, Cluster.class);
        try {
            for (Cluster cluster : clusters) {
                if (log.isDebugEnabled()) {
                    log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] {
                            cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                            cluster.getNumPoints(), AbstractCluster.formatVector(cluster.getRadius(), null),
                            clustersOut.getName() });
                }
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
        clustersIn = clustersOut;
        iteration++;
    }
    Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)
            + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
    FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)),
            finalClustersIn);
    return finalClustersIn;
}