List of usage examples for org.apache.hadoop.io.MapWritable.put
@Override
public Writable put(Writable key, Writable value)
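Before the project-specific examples, a minimal self-contained sketch (class and key names are made up for illustration) of the put contract itself: both key and value must be Writable, and, as with any java.util.Map, put returns the value previously mapped to the key, or null if there was none.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MapWritablePutDemo {
    public static void main(String[] args) {
        MapWritable map = new MapWritable();

        // First put: no previous mapping for the key, so null is returned.
        Writable previous = map.put(new Text("count"), new IntWritable(1));
        System.out.println(previous); // null

        // Putting the same key again replaces the value and returns the old one.
        previous = map.put(new Text("count"), new IntWritable(2));
        System.out.println(previous);                   // 1
        System.out.println(map.get(new Text("count"))); // 2
    }
}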
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchUtils.java
License:Apache License
/**
 * Encodes a particular HmmModel as a Sequence File and writes it to the specified location.
 *
 * @param model     HmmModel to be encoded
 * @param modelPath Location to store the encoded model
 * @param conf      Configuration object
 * @throws IOException
 */
protected static void writeModelToDirectory(HmmModel model, Path modelPath, Configuration conf)
        throws IOException {
    int numHidden = model.getNrOfHiddenStates();
    int numObserved = model.getNrOfOutputStates();
    Matrix emissionMatrix = model.getEmissionMatrix();
    Matrix transitionMatrix = model.getTransitionMatrix();
    Vector initialProbability = model.getInitialProbabilities();

    MapWritable initialDistributionMap = new MapWritable();
    MapWritable transitionDistributionMap = new MapWritable();
    MapWritable emissionDistributionMap = new MapWritable();

    // delete the output directory
    HadoopUtil.delete(conf, modelPath);

    // create new file to store HMM
    FileSystem fs = FileSystem.get(modelPath.toUri(), conf);
    Path outFile = new Path(modelPath, "part-randomSeed");
    boolean newFile = fs.createNewFile(outFile);

    if (newFile) {
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, MapWritable.class);
        try {
            for (int i = 0; i < numHidden; i++) {
                IntWritable initialDistributionKey = new IntWritable(i);
                DoubleWritable initialDistributionValue = new DoubleWritable(initialProbability.get(i));
                initialDistributionMap.put(initialDistributionKey, initialDistributionValue);

                Text transitionDistributionKey = new Text("TRANSIT_" + Integer.toString(i));
                MapWritable transitionDistributionValue = new MapWritable();
                for (int j = 0; j < numHidden; j++) {
                    IntWritable transitionDistributionInnerKey = new IntWritable(j);
                    DoubleWritable transitionDistributionInnerValue = new DoubleWritable(transitionMatrix.get(i, j));
                    transitionDistributionValue.put(transitionDistributionInnerKey, transitionDistributionInnerValue);
                }
                transitionDistributionMap.put(transitionDistributionKey, transitionDistributionValue);

                Text emissionDistributionKey = new Text("EMIT_" + Integer.toString(i));
                MapWritable emissionDistributionValue = new MapWritable();
                for (int j = 0; j < numObserved; j++) {
                    IntWritable emissionDistributionInnerKey = new IntWritable(j);
                    DoubleWritable emissionDistributionInnerValue = new DoubleWritable(emissionMatrix.get(i, j));
                    emissionDistributionValue.put(emissionDistributionInnerKey, emissionDistributionInnerValue);
                }
                emissionDistributionMap.put(emissionDistributionKey, emissionDistributionValue);
            }

            writer.append(new Text("INITIAL"), initialDistributionMap);
            log.info("Wrote random Initial Distribution Map to {}", outFile);

            for (MapWritable.Entry<Writable, Writable> transitionEntry : transitionDistributionMap.entrySet()) {
                writer.append(transitionEntry.getKey(), transitionEntry.getValue());
            }
            log.info("Wrote random Transition Distribution Map to {}", outFile);

            for (MapWritable.Entry<Writable, Writable> emissionEntry : emissionDistributionMap.entrySet()) {
                writer.append(emissionEntry.getKey(), emissionEntry.getValue());
            }
            log.info("Wrote random Emission Distribution Map to {}", outFile);
        } finally {
            Closeables.closeQuietly(writer);
        }
    }
}
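The maps populated with put above are appended to a SequenceFile keyed by Text. For context, here is a minimal reader sketch (not part of the Mahout source; class and method names are assumptions) that walks the resulting part-randomSeed file and prints every nested entry:

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class HmmModelReaderSketch {
    public static void dumpModel(Path modelFile, Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(modelFile.toUri(), conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, modelFile, conf);
        try {
            Text key = new Text();
            MapWritable value = new MapWritable();
            // Each record is one distribution: "INITIAL", "TRANSIT_i" or "EMIT_i".
            // MapWritable.readFields clears the map before filling it, so the same
            // instance can be reused across calls to next().
            while (reader.next(key, value)) {
                for (Map.Entry<Writable, Writable> entry : value.entrySet()) {
                    System.out.println(key + "\t" + entry.getKey() + " = " + entry.getValue());
                }
            }
        } finally {
            reader.close();
        }
    }
}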
From source file:org.apache.nutch.crawl.MimeAdaptiveFetchSchedule.java
License:Apache License
public static void main(String[] args) throws Exception {
    FetchSchedule fs = new MimeAdaptiveFetchSchedule();
    fs.setConf(NutchConfiguration.create());

    // we start the time at 0, for simplicity
    long curTime = 0;
    long delta = 1000L * 3600L * 24L; // one day
    // we trigger the update of the page every 30 days
    long update = 1000L * 3600L * 24L * 30L; // 30 days

    boolean changed = true;
    long lastModified = 0;

    int miss = 0;
    int totalMiss = 0;
    int maxMiss = 0;
    int fetchCnt = 0;
    int changeCnt = 0;

    // initial fetchInterval is 30 days
    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

    // Set a default MIME-type to test with
    org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
    x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
    p.setMetaData(x);
    p.setFetchTime(0);

    LOG.info(p.toString());

    // let's move the timeline a couple of deltas
    for (int i = 0; i < 10000; i++) {
        if (lastModified + update < curTime) {
            //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
            changed = true;
            changeCnt++;
            lastModified = curTime;
        }
        LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
                + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " + miss);
        if (p.getFetchTime() <= curTime) {
            fetchCnt++;
            fs.setFetchSchedule(new Text("http://www.example.com"), p, p.getFetchTime(), p.getModifiedTime(),
                    curTime, lastModified,
                    changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
            LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
                    + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
            if (!changed)
                miss++;
            if (miss > maxMiss)
                maxMiss = miss;
            changed = false;
            totalMiss += miss;
            miss = 0;
        }
        if (changed)
            miss++;
        curTime += delta;
    }
    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
From source file:org.apache.nutch.scoring.nextpage.NextPageAnalysisScoringFilter.java
License:Apache License
private Outlink generateNextPageLink(String baseUrlString, String previousNextPageIndexString)
        throws MalformedURLException {
    if (previousNextPageIndexString == null)
        previousNextPageIndexString = "1";
    int lastNextPageIndex = Integer.valueOf(previousNextPageIndexString).intValue();
    int nextPageIndex = lastNextPageIndex + 1;

    Outlink nextPageOutlink = new Outlink("", "nextPage " + nextPageIndex);

    URL base = new URL(baseUrlString);
    String file = base.getFile();
    String toUrl = base.toString();
    if (file.equals("/")) {
        toUrl = base.toString() + "nextPage/" + nextPageIndex;
    } else {
        toUrl = base.toString().replace(file, "/nextPage/" + nextPageIndex);
    }
    nextPageOutlink.setUrl(toUrl);

    MapWritable outlinkMeta = new MapWritable();
    nextPageOutlink.setMetadata(outlinkMeta);
    outlinkMeta.put(new Text("nextPageIndex"), new IntWritable(nextPageIndex));

    return nextPageOutlink;
}
From source file:org.apache.nutch.scoring.nextpage.NextPageAnalysisScoringFilter.java
License:Apache License
private void upgradeOutLinks(Content parentContent, Outlink outlink, boolean isNextPageLink) {
    MapWritable outlinkMeta = outlink.getMetadata();

    /*
    if (isNextPageLink && outlinkMeta.get(new Text("nextPage")) != null)
        return;
    */
    // No cookies necessary for next page, browser driven

    Metadata parentContentMeta = parentContent.getMetadata();
    if (parentContentMeta.get("Cookie") != null) {
        outlinkMeta.put(new Text("Cookie"), new Text(parentContentMeta.get("Cookie")));
        outlinkMeta.put(new Text("CookieDomain"), new Text(parentContentMeta.get("CookieDomain")));
        outlinkMeta.put(new Text("CookiePath"), new Text(parentContentMeta.get("CookiePath")));
        outlinkMeta.put(new Text("CookieExpiry"), new Text(parentContentMeta.get("CookieExpiry")));
        outlinkMeta.put(new Text("CookieSecure"), new Text(parentContentMeta.get("CookieSecure")));
    }

    // We must add a parent URL meta, so that the XPathIndexingFilter plugin can use the
    // correct scheme to cast the metas and add them to the Nutch document.
    outlinkMeta.put(new Text(PARSE_PARENT_URL), new Text(parentContent.getBaseUrl()));
}
From source file:org.apache.pirk.utils.StringUtils.java
License:Apache License
/**
 * Method to take an input JSON string and output a MapWritable with arrays as JSON-formatted String objects
 */
public static MapWritable jsonStringToMapWritable(String jsonString) {
    MapWritable value = new MapWritable();
    JSONParser jsonParser = new JSONParser();
    try {
        JSONObject jsonObj = (JSONObject) jsonParser.parse(jsonString);
        for (Object key : jsonObj.keySet()) {
            Text mapKey = new Text(key.toString());
            Text mapValue = new Text();
            if (jsonObj.get(key) != null) {
                mapValue.set(jsonObj.get(key).toString());
            }
            value.put(mapKey, mapValue);
        }
    } catch (ParseException e) {
        logger.warn("Could not json-decode string: " + jsonString, e);
    } catch (NumberFormatException e) {
        logger.warn("Could not parse field into number: " + jsonString, e);
    }
    return value;
}
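A short usage sketch for the helper above (the JSON literal is invented for illustration): every field, including numbers, ends up as a Text value in the returned map.

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.pirk.utils.StringUtils;

public class JsonToMapWritableDemo {
    public static void main(String[] args) {
        MapWritable record = StringUtils.jsonStringToMapWritable("{\"name\":\"alice\",\"age\":42}");
        System.out.println(record.get(new Text("name"))); // alice
        System.out.println(record.get(new Text("age")));  // 42 (stored as Text, not IntWritable)
    }
}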
From source file:org.apache.pirk.utils.StringUtils.java
License:Apache License
/**
 * Method to take an input JSON string and output a MapWritable with arrays as WritableArrayWritable objects
 */
public static MapWritable jsonStringToMapWritableWithWritableArrayWritable(String jsonString,
        DataSchema dataSchema) {
    MapWritable value = new MapWritable();
    JSONParser jsonParser = new JSONParser();
    try {
        JSONObject jsonObj = (JSONObject) jsonParser.parse(jsonString);
        for (Object key : jsonObj.keySet()) {
            Text mapKey = new Text(key.toString());
            if (jsonObj.get(key) != null) {
                logger.debug("key = " + key.toString());
                if (dataSchema.isArrayElement((String) key)) {
                    WritableArrayWritable mapValue = StringUtils
                            .jsonArrayStringToWritableArrayWritable(jsonObj.get(key).toString());
                    value.put(mapKey, mapValue);
                } else {
                    Text mapValue = new Text(jsonObj.get(key).toString());
                    value.put(mapKey, mapValue);
                }
            }
        }
    } catch (ParseException e) {
        logger.warn("Could not json-decode string: " + jsonString, e);
    } catch (NumberFormatException e) {
        logger.warn("Could not parse field into number: " + jsonString, e);
    }
    return value;
}
From source file:org.apache.pirk.utils.StringUtils.java
License:Apache License
/**
 * Method to take an input JSON string and output a MapWritable with arrays as ArrayWritable objects
 */
public static MapWritable jsonStringToMapWritableWithArrayWritable(String jsonString, DataSchema dataSchema) {
    MapWritable value = new MapWritable();
    JSONParser jsonParser = new JSONParser();
    try {
        JSONObject jsonObj = (JSONObject) jsonParser.parse(jsonString);
        for (Object key : jsonObj.keySet()) {
            Text mapKey = new Text(key.toString());
            if (jsonObj.get(key) != null) {
                logger.debug("key = " + key.toString());
                if (dataSchema.isArrayElement((String) key)) {
                    ArrayWritable mapValue = StringUtils
                            .jsonArrayStringtoArrayWritable(jsonObj.get(key).toString());
                    value.put(mapKey, mapValue);
                } else {
                    Text mapValue = new Text(jsonObj.get(key).toString());
                    value.put(mapKey, mapValue);
                }
            }
        }
    } catch (ParseException e) {
        logger.warn("Could not json-decode string: " + jsonString, e);
    } catch (NumberFormatException e) {
        logger.warn("Could not parse field into number: " + jsonString, e);
    }
    return value;
}
From source file:org.apache.sqoop.mapreduce.hcat.SqoopHCatUtilities.java
License:Apache License
public static void configureExportInputFormat(SqoopOptions opts, Job job, ConnManager connMgr, String dbTable,
        Configuration config) throws IOException {
    LOG.info("Configuring HCatalog for export job");
    SqoopHCatUtilities hCatUtils = SqoopHCatUtilities.instance();
    hCatUtils.configureHCat(opts, job, connMgr, dbTable, job.getConfiguration());
    job.setInputFormatClass(getInputFormatClass());
    Map<String, Integer> dbColTypes = hCatUtils.getDbColumnTypes();

    MapWritable columnTypesJava = new MapWritable();
    for (Map.Entry<String, Integer> e : dbColTypes.entrySet()) {
        Text columnName = new Text(e.getKey());
        Text columnText = new Text(connMgr.toJavaType(dbTable, e.getKey(), e.getValue()));
        columnTypesJava.put(columnName, columnText);
    }

    MapWritable columnTypesSql = new MapWritable();
    for (Map.Entry<String, Integer> e : dbColTypes.entrySet()) {
        Text columnName = new Text(e.getKey());
        IntWritable sqlType = new IntWritable(e.getValue());
        columnTypesSql.put(columnName, sqlType);
    }

    DefaultStringifier.store(config, columnTypesJava, SqoopHCatUtilities.HCAT_DB_OUTPUT_COLTYPES_JAVA);
    DefaultStringifier.store(config, columnTypesSql, SqoopHCatUtilities.HCAT_DB_OUTPUT_COLTYPES_SQL);
}
From source file:org.apache.sqoop.mapreduce.JdbcExportJob.java
License:Apache License
@Override
protected void configureInputFormat(Job job, String tableName, String tableClassName, String splitByCol)
        throws ClassNotFoundException, IOException {
    fileType = getInputFileType();

    super.configureInputFormat(job, tableName, tableClassName, splitByCol);

    if (isHCatJob) {
        SqoopHCatUtilities.configureExportInputFormat(options, job, context.getConnManager(), tableName,
                job.getConfiguration());
        return;
    } else if (fileType == FileType.AVRO_DATA_FILE) {
        LOG.debug("Configuring for Avro export");
        ConnManager connManager = context.getConnManager();
        Map<String, Integer> columnTypeInts;
        if (options.getCall() == null) {
            columnTypeInts = connManager.getColumnTypes(tableName, options.getSqlQuery());
        } else {
            columnTypeInts = connManager.getColumnTypesForProcedure(options.getCall());
        }
        MapWritable columnTypes = new MapWritable();
        for (Map.Entry<String, Integer> e : columnTypeInts.entrySet()) {
            Text columnName = new Text(e.getKey());
            Text columnText = new Text(connManager.toJavaType(tableName, e.getKey(), e.getValue()));
            columnTypes.put(columnName, columnText);
        }
        DefaultStringifier.store(job.getConfiguration(), columnTypes, AvroExportMapper.AVRO_COLUMN_TYPES_MAP);
    }
}
From source file:org.apache.sqoop.mapreduce.odps.HdfsOdpsImportJob.java
License:Apache License
private void configureGenericRecordExportInputFormat(Job job, String tableName) throws IOException {
    if (options.getOdpsTable() != null) {
        MapWritable columnTypes = new MapWritable();
        Map<String, OdpsType> colTypeMap = getColTypeMap();
        for (Map.Entry<String, OdpsType> e : colTypeMap.entrySet()) {
            String column = e.getKey();
            if (column != null) {
                Text columnName = new Text(column);
                Text columnType = new Text(toJavaType(e.getValue()));
                columnTypes.put(columnName, columnType);
            }
        }
        DefaultStringifier.store(job.getConfiguration(), columnTypes, AvroExportMapper.AVRO_COLUMN_TYPES_MAP);
        return;
    }

    ConnManager connManager = context.getConnManager();
    Map<String, Integer> columnTypeInts;
    if (options.getCall() == null) {
        columnTypeInts = connManager.getColumnTypes(tableName, options.getSqlQuery());
    } else {
        columnTypeInts = connManager.getColumnTypesForProcedure(options.getCall());
    }
    String[] specifiedColumns = options.getColumns();
    MapWritable columnTypes = new MapWritable();
    for (Map.Entry<String, Integer> e : columnTypeInts.entrySet()) {
        String column = e.getKey();
        column = (specifiedColumns == null) ? column : options.getColumnNameCaseInsensitive(column);
        if (column != null) {
            Text columnName = new Text(column);
            Text columnType = new Text(connManager.toJavaType(tableName, column, e.getValue()));
            columnTypes.put(columnName, columnType);
        }
    }
    DefaultStringifier.store(job.getConfiguration(), columnTypes, AvroExportMapper.AVRO_COLUMN_TYPES_MAP);
}
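In the three Sqoop-style examples above, the MapWritable built with put is serialized into the job configuration with DefaultStringifier.store. For context, a brief sketch (class and method names are illustrative) of the matching read side, where the map is recovered with DefaultStringifier.load under the same key (for example AvroExportMapper.AVRO_COLUMN_TYPES_MAP):

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ColumnTypeLookup {
    // Reads back a MapWritable stored under the given configuration key and
    // prints the column-name -> type pairs that were put into it.
    public static void dumpColumnTypes(Configuration conf, String confKey) throws IOException {
        MapWritable columnTypes = DefaultStringifier.load(conf, confKey, MapWritable.class);
        for (Map.Entry<Writable, Writable> e : columnTypes.entrySet()) {
            Text columnName = (Text) e.getKey();
            System.out.println(columnName + " -> " + e.getValue());
        }
    }
}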