List of usage examples for org.apache.hadoop.fs.Path.getName()
public String getName()
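Path.getName() returns the final component of the path, i.e. everything after the last '/', with no scheme, authority, or parent directories. A minimal sketch of the behaviour (not taken from the sources below; the host, port, and file names are made up for illustration):

    Path p = new Path("hdfs://namenode:8020/user/alice/logs/part-00000.gz");
    System.out.println(p.getName());              // part-00000.gz
    System.out.println(p.getParent().getName());  // logs

Most of the examples below use getName() either to test the file name (endsWith(".log"), endsWith(".gz"), equals("user")) or to build a new relative Path from just that last component.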
From source file:de.huberlin.wbi.hiway.scheduler.Scheduler.java
License:Apache License
protected void parseLogs() {
    String hdfsBaseDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_BASE_DEFAULT);
    String hdfsSandboxDirectoryName = conf.get(HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE,
            HiWayConfiguration.HIWAY_AM_DIRECTORY_CACHE_DEFAULT);
    Path hdfsBaseDirectory = new Path(new Path(hdfs.getUri()), hdfsBaseDirectoryName);
    Path hdfsSandboxDirectory = new Path(hdfsBaseDirectory, hdfsSandboxDirectoryName);
    try {
        for (FileStatus appDirStatus : hdfs.listStatus(hdfsSandboxDirectory)) {
            if (appDirStatus.isDirectory()) {
                Path appDir = appDirStatus.getPath();
                for (FileStatus srcStatus : hdfs.listStatus(appDir)) {
                    Path src = srcStatus.getPath();
                    String srcName = src.getName();
                    if (srcName.endsWith(".log")) {
                        Path dest = new Path(appDir.getName());
                        System.out.println("Parsing log " + dest.toString());
                        hdfs.copyToLocalFile(false, src, dest);
                        try (BufferedReader reader = new BufferedReader(
                                new FileReader(new File(dest.toString())))) {
                            String line;
                            while ((line = reader.readLine()) != null) {
                                JsonReportEntry entry = new JsonReportEntry(line);
                                addEntryToDB(entry);
                            }
                        }
                    }
                }
            }
        }
    } catch (IOException | JSONException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads a bunch of lines of file paths into a list.
 * The code in this method is redistributed from Hadoop's LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }

    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();

    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip BOM (Byte Order Mark).
            // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // found UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());

        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJobDkbd.java
License:Apache License
@Override
public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
    try {
        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
            String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
            for (Path p : DistributedCache.getLocalCacheFiles(conf))
                if (p.getName().contains(extractorConfigurationFileName))
                    extractorConfigurationFilesArr[i] = p.toString();
        }

        int maxlength = conf.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1);

        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline.createGoogleSyntacticsRelationEngine(
                true/* create_tokens */,
                true/* create_sentences */,
                true/* create_dependencies */,
                true/* create_new_relations */,
                true/* create_dependency_path */,
                false/* ignore_nn_relations */,
                maxlength/* dependency_path_maxlength */,
                false/* create_detailed_output */,
                extractorConfigurationFilesArr/* extractor_configuration */,
                SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME/* output_destination */)));
        return builder.createAggregateDescription();
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }
}
From source file:de.tudarmstadt.lt.n2n.hadoop.PreparsedJob.java
License:Apache License
@Override
public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
    try {
        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
            String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
            for (Path p : DistributedCache.getLocalCacheFiles(conf))
                if (p.getName().contains(extractorConfigurationFileName))
                    extractorConfigurationFilesArr[i] = p.toString();
        }

        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline.createPreparsedRelationEngine(
                true/* create_tokens */,
                true/* create_sentences */,
                true/* create_dependencies */,
                true/* create_new_relations */,
                true/* create_dependency_path */,
                false/* ignore_nn_relations */,
                5/* dependency_path_maxlength */,
                false/* create_detailed_output */,
                extractorConfigurationFilesArr/* extractor_configuration */,
                SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME/* output_destination */)));
        return builder.createAggregateDescription();
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java
License:Apache License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically
 * decompressed on the fly.
 *
 * @param conf     The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
From source file:dz.lab.hdfs.LsWithPathFilter.java
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path("/");

    // restrict the result of listStatus() by supplying a PathFilter
    FileStatus[] files = fs.listStatus(path, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            // hide any path whose name equals "user"
            if (path.getName().equals("user")) {
                return false;
            }
            return true;
        }
    });

    for (FileStatus file : files) {
        System.out.println(file.getPath().getName());
    }
}
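The anonymous PathFilter above only ever inspects the final path component via getName(). Since PathFilter has a single accept(Path) method, the same filter can be written as a lambda on Java 8+; this is my own sketch, not part of the source above:

    // equivalent filter as a lambda: hide the entry named "user"
    FileStatus[] filtered = fs.listStatus(new Path("/"), p -> !p.getName().equals("user"));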
From source file:eagle.dataproc.impl.storm.hdfs.UserProfileGenerationHDFSSpout.java
License:Apache License
public void copyFiles() {
    LOG.info("Inside listFiles()");
    //Configuration conf = new Configuration();
    JobConf conf = new JobConf();

    // _____________ TO TEST THAT CORRECT HADOOP JARs ARE INCLUDED __________________
    ClassLoader cl = ClassLoader.getSystemClassLoader();
    URL[] urls = ((URLClassLoader) cl).getURLs();
    if (LOG.isDebugEnabled()) {
        for (URL url : urls) {
            LOG.debug(url.getFile());
        }
    }
    // _________________________________________

    String hdfsConnectionStr = configContext.getString("dataSourceConfig.hdfsConnection");
    LOG.info("HDFS connection string: " + hdfsConnectionStr);

    String hdfsPath = configContext.getString("dataSourceConfig.hdfsPath");
    LOG.info("HDFS path: " + hdfsPath);

    String copyToPath = configContext.getString("dataSourceConfig.copyToPath");
    LOG.info("copyToPath: " + copyToPath);

    String srcPathStr = new String("hdfs://" + hdfsConnectionStr + hdfsPath);
    Path srcPath = new Path(srcPathStr);
    LOG.info("listFiles called");
    LOG.info("srcPath: " + srcPath);

    try {
        FileSystem fs = srcPath.getFileSystem(conf);
        /*CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(srcPath);
        DataInputStream inputStream = new DataInputStream(codec.createInputStream(fs.open(srcPath)));
        */
        Path destPath = new Path(copyToPath);
        LOG.info("Destination path: " + destPath);

        String userListFileName = configContext.getString("dataSourceConfig.userList");
        //loggerHDFSSpout.info("userListFileName: " + userListFileName);
        List<String> userList = getUser(userListFileName);
        for (String user : userList) {
            Path finalSrcPath = new Path(srcPath.getName() + "/" + user);
            fs.copyToLocalFile(finalSrcPath, destPath);
        }
        LOG.info("Copy to local succeed");
        fs.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:edu.american.student.mnemosyne.core.IngestProcess.java
License:Apache License
/**
 * For every path, grab every line of the file and throw it into Accumulo.
 */
@SuppressWarnings("static-access")
public void process() throws ProcessException {
    for (Path path : pathsToProcess) {
        uuid = UUID.randomUUID().toString();
        this.fileName = path.getName();

        HadoopForeman hForeman = new HadoopForeman();
        HadoopJobConfiguration conf = new HadoopJobConfiguration();
        conf.setJarClass(this.getClass());
        conf.setMapperClass(IngestMapper.class);
        conf.setInputFormatClass(TextInputFormat.class);
        conf.overridePathToProcess(path);
        conf.setOutputFormatClass(NullOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        hForeman.runJob(conf);
    }
}
From source file:edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java
License:Open Source License
public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
    setClassName("HoopWholeFileRecordReader");
    debug("HoopWholeFileRecordReader ()");

    job = aJob;
    FileSplit split = (FileSplit) aSplit;

    //this.maxLineLength=job.getInt ("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);

    fileSize = split.getLength();

    final Path file = split.getPath();

    createKeyFromName(file.getName());

    debug("File/Key: " + internalKey + " with size: " + split.getLength());

    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = null;
    try {
        fs = file.getFileSystem(job);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    FSDataInputStream fileIn = null;
    try {
        fileIn = fs.open(split.getPath());
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    if (codec != null) {
        try {
            inStream = codec.createInputStream(fileIn);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    } else {
        inStream = fileIn;
    }
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, int maxIterations, String delta) throws IOException {
    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();

    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);

    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }

    boolean converged = false;
    int iteration = 1;
    while (!converged && iteration <= maxIterations) {
        log.info("K-Means Iteration: {}", iteration);
        FileSystem fs = FileSystem.get(input.toUri(), conf);
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
                PathFilters.logsCRCFilter(), conf)) {
            clusterer.addPointToNearestCluster(value.get(), clusters);
        }
        converged = clusterer.testConvergence(clusters, Double.parseDouble(delta));

        Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersOut, "part-r-00000"),
                Text.class, Cluster.class);
        try {
            for (Cluster cluster : clusters) {
                if (log.isDebugEnabled()) {
                    log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}",
                            new Object[] { cluster.getId(),
                                    AbstractCluster.formatVector(cluster.getCenter(), null),
                                    cluster.getNumPoints(),
                                    AbstractCluster.formatVector(cluster.getRadius(), null),
                                    clustersOut.getName() });
                }
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
        clustersIn = clustersOut;
        iteration++;
    }

    Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)
            + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
    FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)), finalClustersIn);
    return finalClustersIn;
}