List of usage examples for org.apache.hadoop.fs Path toUri
public URI toUri()
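For orientation before the source-file examples, here is a minimal, self-contained sketch of what toUri() returns. It is not taken from any of the files below; the hdfs://namenode:8020 authority and the paths are made up for illustration.

import java.net.URI;

import org.apache.hadoop.fs.Path;

public class PathToUriDemo {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in the URI.
        Path qualified = new Path("hdfs://namenode:8020/data/input");
        URI uri = qualified.toUri();
        System.out.println(uri.getScheme()); // hdfs
        System.out.println(uri.getPath());   // /data/input

        // A scheme-less path yields a URI with a null scheme. This is why the
        // examples below pass path.toUri() to FileSystem.get(uri, conf): a path
        // with a scheme selects the matching filesystem, while a bare path falls
        // back to the default filesystem from the configuration.
        Path bare = new Path("/tmp/example");
        System.out.println(bare.toUri().getScheme()); // null
    }
}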
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
/**
 * Get avro schema of first input file that matches the location pattern.
 *
 * @param paths set of input files
 * @param conf configuration
 * @return avro schema
 * @throws IOException
 */
protected Schema getAvroSchema(Set<Path> paths, Configuration conf) throws IOException {
    if (paths == null || paths.isEmpty()) {
        return null;
    }
    Iterator<Path> iterator = paths.iterator();
    Schema schema = null;
    while (iterator.hasNext()) {
        Path path = iterator.next();
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        schema = getAvroSchema(path, fs);
        if (schema != null) {
            break;
        }
    }
    return schema;
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
/**
 * Merge multiple input avro schemas into one. Note that we can't merge arbitrary schemas.
 * Please see AvroStorageUtils.mergeSchema() for what's allowed and what's not allowed.
 *
 * @param basePaths set of input dirs or files
 * @param conf configuration
 * @return avro schema
 * @throws IOException
 */
protected Schema getMergedSchema(Set<Path> basePaths, Configuration conf) throws IOException {
    Schema result = null;
    Map<Path, Schema> mergedFiles = new HashMap<Path, Schema>();

    Set<Path> paths = AvroStorageUtils.getAllFilesRecursively(basePaths, conf);
    for (Path path : paths) {
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        Schema schema = getSchema(path, fs);
        if (schema != null) {
            result = AvroStorageUtils.mergeSchema(result, schema);
            mergedFiles.put(path, schema);
        }
    }

    // schemaToMergedSchemaMap is only needed when merging multiple records.
    if ((schemaToMergedSchemaMap == null || schemaToMergedSchemaMap.isEmpty())
            && mergedFiles.size() > 1
            && result.getType().equals(Schema.Type.RECORD)) {
        schemaToMergedSchemaMap = AvroStorageUtils.getSchemaToMergedSchemaMap(result, mergedFiles);
    }
    return result;
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
/**
 * Initialize output avro schema using input property map.
 */
protected void init(Map<String, Object> inputs) throws IOException {
    /* used to store field schemas */
    List<Field> fields = null;

    /* set debug level */
    if (inputs.containsKey("debug")) {
        AvroStorageLog.setDebugLevel((Integer) inputs.get("debug"));
    }

    /* initialize schema manager, if any */
    AvroSchemaManager schemaManager = null;
    if (inputs.containsKey("data")) {
        Path path = new Path((String) inputs.get("data"));
        AvroStorageLog.details("data path=" + path.toUri().toString());
        FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
        Schema schema = getAvroSchema(path, fs);
        schemaManager = new AvroSchemaManager(schema);
    } else if (inputs.containsKey("schema_file")) {
        Path path = new Path((String) inputs.get("schema_file"));
        AvroStorageLog.details("schemaFile path=" + path.toUri().toString());
        FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
        Schema schema = getSchemaFromFile(path, fs);
        schemaManager = new AvroSchemaManager(schema);
    }

    /* iterate input property map */
    for (Entry<String, Object> entry : inputs.entrySet()) {
        String name = entry.getKey().trim();
        Object value = entry.getValue();

        if (name.equalsIgnoreCase("index")) {
            /* set index of store function */
            storeFuncIndex = (Integer) value;
        } else if (name.equalsIgnoreCase("same")) {
            /* use schema in the specified path as output schema */
            Path path = new Path(((String) value).trim());
            AvroStorageLog.details("data path=" + path.toUri().toString());
            FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
            outputAvroSchema = getAvroSchema(path, fs);
        } else if (name.equalsIgnoreCase("nullable")) {
            nullable = (Boolean) value;
        } else if (name.equalsIgnoreCase("schema")) {
            outputAvroSchema = Schema.parse((String) value);
            userSpecifiedAvroSchema = outputAvroSchema;
        } else if (name.equalsIgnoreCase("schema_uri")) {
            /* use the contents of the specified path as output schema */
            Path path = new Path(((String) value).trim());
            AvroStorageLog.details("schema_uri path=" + path.toUri().toString());
            FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
            outputAvroSchema = getSchemaFromFile(path, fs);
            userSpecifiedAvroSchema = outputAvroSchema;
        } else if (name.matches("field\\d+")) {
            /* set schema of the d-th field */
            if (fields == null)
                fields = new ArrayList<Field>();

            int index = Integer.parseInt(name.substring("field".length()));
            String content = ((String) value).trim();
            Field field = null;
            if (content.equalsIgnoreCase(NOTNULL)) {
                /* null means deriving avro schema from pig schema but not null */
                field = AvroStorageUtils.createUDField(index, null);
            } else if (content.startsWith("def:")) {
                if (schemaManager == null)
                    throw new IOException("Please specify data parameter (using \"data\") before this one.");
                String alias = content.substring("def:".length());
                Schema s = schemaManager.getSchema(alias);
                if (s == null)
                    throw new IOException("Cannot find matching schema for alias:" + alias);
                /* use pre-defined schema */
                field = AvroStorageUtils.createUDField(index, s);
                AvroStorageLog.details("Use pre-defined schema(" + alias + "): " + s + " for field " + index);
            } else {
                Schema schema = null;
                try {
                    schema = Schema.parse(content);
                } catch (RuntimeException e) {
                    /* might be a primitive schema like int or long */
                    schema = Schema.parse("\"" + content + "\"");
                }
                field = AvroStorageUtils.createUDField(index, schema);
            }
            fields.add(field);
        } else if (!name.equalsIgnoreCase("data") && !name.equalsIgnoreCase("schema_file")
                && !name.equalsIgnoreCase("debug")) {
            throw new IOException("Invalid parameter:" + name);
        }
    }

    /* if schemas of some fields are set */
    if (fields != null && outputAvroSchema == null) {
        outputAvroSchema = AvroStorageUtils.createUDPartialRecordSchema();
        outputAvroSchema.setFields(fields);
    }

    /* print warning if both output schema and nullable are specified; nullable will be ignored */
    if (outputAvroSchema != null) {
        if (!nullable) {
            AvroStorageLog.warn("Invalid parameter--nullable cannot be false while "
                    + "output schema is not null. Will ignore nullable.\n\n");
            nullable = true;
        }
    }
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java
License:Apache License
/**
 * Returns all non-hidden files recursively inside the base paths given.
 *
 * @throws IOException
 */
public static Set<Path> getAllFilesRecursively(Set<Path> basePaths, Configuration conf) throws IOException {
    Set<Path> paths = new HashSet<Path>();
    for (Path path : basePaths) {
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        FileStatus f = fs.getFileStatus(path);
        if (f.isDir()) {
            getAllFilesInternal(f, conf, paths, fs);
        } else {
            paths.add(path);
        }
    }
    return paths;
}
From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License:Apache License
private void addDepsJarToDistributedCache(Path path, Job job) throws IOException {
    LOGGER.info("Trying to add all the deps jar files from directory: {}", path);
    FileSystem fs = FileSystem.get(getConf());
    FileStatus[] fileStatusArr = fs.listStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            addDepsJarToDistributedCache(fileStatus.getPath(), job);
        } else {
            Path depJarPath = fileStatus.getPath();
            if (depJarPath.getName().endsWith(".jar")) {
                // Cache the jar itself (depJarPath), not the directory being scanned.
                LOGGER.info("Adding deps jar file: {}", depJarPath);
                job.addCacheArchive(depJarPath.toUri());
            }
        }
    }
}
From source file:com.linkedin.pinot.hadoop.job.SegmentUriPushJob.java
License:Apache License
public void pushOneTarFile(FileSystem fs, Path path) throws Exception {
    String fileName = path.getName();
    if (!fileName.endsWith(".tar.gz")) {
        return;
    }
    for (String host : _hosts) {
        String uri = String.format("%s%s%s", _pushUriPrefix, path.toUri().getRawPath(), _pushUriSuffix);
        LOGGER.info("******** Uploading file: {} to Host: {} and Port: {} with download uri: {} *******",
                fileName, host, _port, uri);
        try {
            int responseCode = FileUploadUtils.sendSegmentUri(host, _port, uri);
            LOGGER.info("Response code: {}", responseCode);
        } catch (Exception e) {
            LOGGER.error("******** Error uploading file: {} to Host: {} and Port: {} *******",
                    fileName, host, _port);
            LOGGER.error("Caught exception during upload", e);
            throw new RuntimeException("Got error while sending tar files to push hosts!");
        }
    }
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");
    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();
    int fileCount = 0;

    // Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");

                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null)
                            return true;
                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores)
                            LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }
                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n  Directory is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);
                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();
                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }

                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();

                            long fileLength = content.getLen();
                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output
                     * file and then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n  Directory has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;
                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }
                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));
                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();
                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n  Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, "\n  Generating " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();
                                List<String> filesInBucket = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));

                                key.set(bucketId);
                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();
                                    value.set(pathMatcher.group(5));

                                    /*
                                     * Write one row per file to maximize the number of mappers.
                                     */
                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());
                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Marked " + removableFiles.size() + " files for removal");
                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n    " + removable);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Skipped " + uncrushedFiles.size() + " files");
                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }
            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;

    for (Bucket partition : partitions) {
        String partitionName = partition.name();
        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);
        if (partition.contents().size() > 0)
            totalReducers++;
        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
    writer.close();

    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);

    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}
From source file:com.m6d.filecrush.crush.integration.CrushMapReduceTest.java
License:Apache License
@Before
@Override
public void setUp() throws Exception {
    super.setUp();

    job = createJobConf();

    job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    job.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.name());
    job.set("mapreduce.output.fileoutputformat.compress.codec", CustomCompressionCodec.class.getName());

    FileSystem fs = getFileSystem();

    Path homeDirPath = fs.makeQualified(new Path("."));
    homeDir = homeDirPath.toUri().getPath();
    fs.delete(homeDirPath, true);

    defaultCodec = new DefaultCodec();
    defaultCodec.setConf(job);

    customCodec = new CustomCompressionCodec();
    customCodec.setConf(job);
}
From source file:com.marklogic.contentpump.ArchiveInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Iterator<InputSplit> iter = splits.iterator();
    while (iter.hasNext()) {
        InputSplit s = iter.next();
        Path file = ((FileSplit) s).getPath();
        String zipfile = file.toUri().getPath();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Zip file name: " + zipfile);
        }
        int index = file.toUri().getPath().lastIndexOf(EXTENSION);
        if (index == -1) {
            throw new IOException("Archive file should have suffix .zip");
        }
        String subStr = file.toUri().getPath().substring(0, index);
        index = subStr.lastIndexOf('-');
        if (index == -1) {
            throw new IOException("No type information in archive name");
        }
        String typeStr = subStr.substring(index + 1, subStr.length());
        try {
            ContentType.valueOf(typeStr);
        } catch (IllegalArgumentException ex) {
            LOG.error("Not a valid archive: " + zipfile);
            iter.remove();
        }
    }
    return splits;
}
From source file:com.marklogic.contentpump.DocumentPathFilter.java
License:Apache License
@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
    String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
    if (LOG.isDebugEnabled()) {
        LOG.debug(ConfigConstants.CONF_INPUT_DIRECTORY + ": " + inPath);
    }
    Path path = new Path(inPath);
    try {
        fs = FileSystem.get(path.toUri(), conf);
    } catch (IOException e) {
        LOG.error("Please check path: " + inPath, e);
    }
}