List of usage examples for org.apache.hadoop.io.MapWritable.entrySet()
@Override
public Set<Map.Entry<Writable, Writable>> entrySet()
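Before the project examples below, here is a minimal, self-contained sketch of the call being catalogued. The class name MapWritableEntrySetDemo and the sample keys/values are illustrative only and do not come from any of the projects listed.

import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MapWritableEntrySetDemo {
    public static void main(String[] args) {
        // Build a small MapWritable with heterogeneous Writable values
        MapWritable map = new MapWritable();
        map.put(new Text("path"), new Text("/docs/index.html"));
        map.put(new Text("score"), new IntWritable(42));

        // entrySet() exposes the map as Map.Entry<Writable, Writable> pairs,
        // so keys and values typically need a cast or toString() before use
        for (Map.Entry<Writable, Writable> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}

This is the common pattern shared by all the examples that follow: iterate the entry set and convert each Writable key/value into whatever the target system expects.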
From source file:com.dasasian.chok.lucene.integration.LuceneClientTest.java
License:Apache License
@Test
public void testGetDetailsConcurrently() throws Exception {
    miniCluster.deployIndex(LuceneTestResources.INDEX1, 1);
    LuceneClient client = new LuceneClient(miniCluster.createInteractionProtocol());
    final Query query = new QueryParser(Version.LUCENE_30, "", new KeywordAnalyzer()).parse("content: the");
    final Hits hits = client.search(query, null, 10);
    assertNotNull(hits);
    assertEquals(10, hits.getHits().size());
    List<MapWritable> detailList = client.getDetails(hits.getHits());
    assertEquals(hits.getHits().size(), detailList.size());
    for (int i = 0; i < detailList.size(); i++) {
        final MapWritable details1 = client.getDetails(hits.getHits().get(i));
        final MapWritable details2 = detailList.get(i);
        assertEquals(details1.entrySet(), details2.entrySet());
        final Set<Writable> keySet = details2.keySet();
        assertFalse(keySet.isEmpty());
        final Writable writable = details2.get(new Text("path"));
        assertNotNull(writable);
    }
    client.close();
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksWriter.java
License:Apache License
protected SolrInputDocument convertToSOLR(BehemothDocument doc) {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    // map from a Behemoth document to a SOLR one
    // the field names below should be modified to match the SOLR schema
    inputDoc.setField("id", doc.getUrl());
    inputDoc.setField("text", doc.getText());
    LOG.debug("Adding field : id\t" + doc.getUrl());
    // LOG.debug("Adding field : text\t" + doc.getText());
    // Rely on LucidWorks field mapping to handle this, or the dynamic fields
    MapWritable metadata = doc.getMetadata();
    if (includeMetadata && metadata != null) {
        for (Entry<Writable, Writable> entry : metadata.entrySet()) {
            inputDoc.addField(entry.getKey().toString(), entry.getValue().toString());
        }
    }
    // iterate on the annotations of interest and create a new field for each one.
    // It is advised NOT to send frequent annotation types such as token, as this
    // would generate a huge number of fields which won't be used by SOLR for
    // tokenizing anyway. What you can do instead is concatenate the token values
    // into a new content string separated by spaces.
    if (includeAnnotations) {
        Iterator<Annotation> iterator = doc.getAnnotations().iterator();
        while (iterator.hasNext()) {
            Annotation current = iterator.next();
            // check whether it belongs to a type we'd like to send to SOLR
            Map<String, String> featureField = fieldMapping.get(current.getType());
            if (featureField == null)
                continue;
            // iterate on the expected features
            for (String targetFeature : featureField.keySet()) {
                String SOLRFieldName = featureField.get(targetFeature);
                String value = null;
                if ("*".equals(targetFeature)) {
                    // special case for covering text
                    value = doc.getText().substring((int) current.getStart(), (int) current.getEnd());
                } else {
                    // get the value for the feature
                    value = current.getFeatures().get(targetFeature);
                }
                LOG.debug("Adding field : " + SOLRFieldName + "\t" + value);
                // skip if no value has been found
                if (value != null)
                    inputDoc.setField(SOLRFieldName, value);
            }
        }
    }
    float boost = 1.0f;
    inputDoc.setDocumentBoost(boost);
    return inputDoc;
}
From source file:com.digitalpebble.behemoth.solr.SOLRWriter.java
License:Apache License
protected SolrInputDocument convertToSOLR(BehemothDocument doc) {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    // map from a Behemoth document to a SOLR one
    // the field names below should be modified to match the SOLR schema
    inputDoc.setField("id", doc.getUrl());
    inputDoc.setField("text", doc.getText());
    LOG.debug("Adding field : id\t" + doc.getUrl());
    // Rely on the field mapping to handle this, or the dynamic fields
    MapWritable metadata = doc.getMetadata();
    if (includeMetadata && metadata != null) {
        for (Entry<Writable, Writable> entry : metadata.entrySet()) {
            if (useMetadataPrefix) {
                String key = metadataPrefix + entry.getKey().toString();
                inputDoc.addField(key, entry.getValue().toString());
            } else {
                inputDoc.addField(entry.getKey().toString(), entry.getValue().toString());
            }
        }
    }
    // iterate on the annotations of interest and create a new field for each one.
    // It is advised NOT to send frequent annotation types such as token, as this
    // would generate a huge number of fields which won't be used by SOLR for
    // tokenizing anyway. What you can do instead is concatenate the token values
    // into a new content string separated by spaces.
    if (includeAnnotations) {
        Iterator<Annotation> iterator = doc.getAnnotations().iterator();
        while (iterator.hasNext()) {
            Annotation current = iterator.next();
            // check whether it belongs to a type we'd like to send to SOLR
            Map<String, String> featureField = fieldMapping.get(current.getType());
            // special case of all annotations
            if (featureField == null && !includeAllAnnotations) {
                continue;
            }
            if (!includeAllAnnotations) {
                // iterate on the expected features
                for (String targetFeature : featureField.keySet()) {
                    String SOLRFieldName = featureField.get(targetFeature);
                    String value = null;
                    if ("*".equals(targetFeature)) {
                        // special case for covering text
                        value = doc.getText().substring((int) current.getStart(), (int) current.getEnd());
                    } else {
                        // get the value for the feature
                        value = current.getFeatures().get(targetFeature);
                    }
                    LOG.debug("Adding field : " + SOLRFieldName + "\t" + value);
                    // skip if no value has been found
                    if (value != null)
                        inputDoc.addField(SOLRFieldName, value);
                }
            } else {
                for (Entry<String, String> e : current.getFeatures().entrySet()) {
                    inputDoc.addField(annotationPrefix + current.getType() + "." + e.getKey(), e.getValue());
                }
            }
        }
    }
    float boost = 1.0f;
    inputDoc.setDocumentBoost(boost);
    return inputDoc;
}
From source file:com.facebook.hiveio.mapreduce.output.HiveTools.java
License:Apache License
/**
 * Map hive record
 *
 * @param conf Configuration
 * @param value data
 * @return hive record
 */
public static HiveWritableRecord mapToHiveRecord(Configuration conf, MapWritable value) {
    HiveTableSchema schema = HiveTableSchemas.lookup(conf, getHiveTableName());
    HiveWritableRecord record = HiveRecordFactory.newWritableRecord(schema);
    for (Map.Entry<Writable, Writable> entry : value.entrySet()) {
        IntWritable intKey = (IntWritable) entry.getKey();
        LongWritable longValue = (LongWritable) entry.getValue();
        record.set(intKey.get(), longValue.get());
    }
    return record;
}
From source file:com.jfolson.hive.serde.RTypedBytesWritableOutput.java
License:Apache License
public void writeMap(MapWritable mw) throws IOException {
    out.writeMapHeader(mw.size());
    for (Map.Entry<Writable, Writable> entry : mw.entrySet()) {
        write(entry.getKey());
        write(entry.getValue());
    }
}
From source file:com.redgate.hadoop.hive.azuretables.AzureTablesRecordWriter.java
License:Apache License
/**
 * Writes a MapWritable out to the Azure Table
 */
public void write(Writable w) throws IOException {
    MapWritable map = (MapWritable) w;
    properties.clear();
    for (Entry<Writable, Writable> e : map.entrySet()) {
        // TODO - more intelligent type mapping (making everything a string is hardly subtle)
        EntityProperty value = new EntityProperty(e.getValue().toString());
        properties.put(e.getKey().toString(), value);
    }
    DynamicTableEntity entity = new DynamicTableEntity(properties);
    entity.setPartitionKey(partitionKey);
    entity.setRowKey(UUID.randomUUID().toString());
    TableOperation op = TableOperation.insert(entity);
    try {
        tableClient.execute(table, op);
    } catch (StorageException e) {
        throw new IOException(e);
    }
}
From source file:com.TCG.Nutch_DNS.HostDbReducer.java
License:Apache License
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
        Reporter reporter) throws IOException {

    CrawlDatum fetch = new CrawlDatum();
    CrawlDatum old = new CrawlDatum();
    boolean fetchSet = false;
    boolean oldSet = false;
    byte[] signature = null;
    boolean multiple = false; // avoid deep copy when only single value exists
    linked.clear();
    org.apache.hadoop.io.MapWritable metaFromParse = null;

    while (values.hasNext()) {
        CrawlDatum datum = values.next();
        if (!multiple && values.hasNext())
            multiple = true;
        if (CrawlDatum.hasDbStatus(datum)) {
            if (!oldSet) {
                if (multiple) {
                    old.set(datum);
                } else {
                    // no need for a deep copy - this is the only value
                    old = datum;
                }
                oldSet = true;
            } else {
                // always take the latest version
                if (old.getFetchTime() < datum.getFetchTime())
                    old.set(datum);
            }
            continue;
        }

        if (CrawlDatum.hasFetchStatus(datum)) {
            if (!fetchSet) {
                if (multiple) {
                    fetch.set(datum);
                } else {
                    fetch = datum;
                }
                fetchSet = true;
            } else {
                // always take the latest version
                if (fetch.getFetchTime() < datum.getFetchTime())
                    fetch.set(datum);
            }
            continue;
        }

        switch (datum.getStatus()) { // collect other info
        case CrawlDatum.STATUS_LINKED:
            CrawlDatum link;
            if (multiple) {
                link = new CrawlDatum();
                link.set(datum);
            } else {
                link = datum;
            }
            linked.insert(link);
            break;
        case CrawlDatum.STATUS_SIGNATURE:
            signature = datum.getSignature();
            break;
        case CrawlDatum.STATUS_PARSE_META:
            metaFromParse = datum.getMetaData();
            break;
        default:
            LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
        }
    }

    // copy the content of the queue into a List in reversed order
    int numLinks = linked.size();
    List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
    for (int i = numLinks - 1; i >= 0; i--) {
        linkList.add(linked.pop());
    }

    // if it doesn't already exist, skip it
    if (!oldSet && !additionsAllowed)
        return;

    // if there is no fetched datum, perhaps there is a link
    if (!fetchSet && linkList.size() > 0) {
        fetch = linkList.get(0);
        fetchSet = true;
    }

    // still no new data - record only unchanged old data, if it exists, and return
    if (!fetchSet) {
        if (oldSet) { // at this point at least "old" should be present
            output.collect(key, old);
            reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1);
        } else {
            LOG.warn("Missing fetch and old value, signature=" + signature);
        }
        return;
    }

    if (signature == null)
        signature = fetch.getSignature();
    long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
    long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

    // initialize with the latest version, be it fetch or link
    result.set(fetch);
    if (oldSet) {
        // copy metadata from old, if it exists
        if (old.getMetaData().size() > 0) {
            result.putAllMetaData(old);
            // overlay with new, if any
            if (fetch.getMetaData().size() > 0)
                result.putAllMetaData(fetch);
        }
        // set the most recent valid value of modifiedTime
        if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
            result.setModifiedTime(old.getModifiedTime());
        }
    }

    switch (fetch.getStatus()) { // determine new status

    case CrawlDatum.STATUS_LINKED: // it was a link
        if (oldSet) { // if old exists
            result.set(old); // use it
        } else {
            result = schedule.initializeSchedule(key, result);
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            try {
                scfilters.initialScore(key, result);
            } catch (ScoringFilterException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                }
                result.setScore(0.0f);
            }
        }
        break;

    case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
    case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
    case CrawlDatum.STATUS_FETCH_REDIR_PERM:
    case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, not modified
        // determine the modification status
        int modified = FetchSchedule.STATUS_UNKNOWN;
        if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            modified = FetchSchedule.STATUS_NOTMODIFIED;
        } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            // only successful fetches (but not redirects, NUTCH-1422)
            // are detected as "not modified" by signature comparison
            if (oldSet && old.getSignature() != null && signature != null) {
                if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
                    modified = FetchSchedule.STATUS_MODIFIED;
                } else {
                    modified = FetchSchedule.STATUS_NOTMODIFIED;
                }
            }
        }
        // set the schedule
        result = schedule.setFetchSchedule(key, result, prevFetchTime, prevModifiedTime, fetch.getFetchTime(),
                fetch.getModifiedTime(), modified);
        // set the result status and signature
        if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
            result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
            // NUTCH-1341 The page is not modified according to its signature,
            // let's reset lastModified as well
            result.setModifiedTime(prevModifiedTime);
            if (oldSet)
                result.setSignature(old.getSignature());
        } else {
            switch (fetch.getStatus()) {
            case CrawlDatum.STATUS_FETCH_SUCCESS:
                result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_PERM:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
                break;
            default:
                LOG.warn("Unexpected status: " + fetch.getStatus() + " resetting to old status.");
                if (oldSet)
                    result.setStatus(old.getStatus());
                else
                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            }
            result.setSignature(signature);
        }
        // https://issues.apache.org/jira/browse/NUTCH-1656
        if (metaFromParse != null) {
            for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
                result.getMetaData().put(e.getKey(), e.getValue());
            }
        }
        // if fetchInterval is larger than the system-wide maximum, trigger
        // an unconditional recrawl. This prevents the page from being stuck in
        // the NOTMODIFIED state when the old fetched copy was already removed
        // with old segments.
        if (maxInterval < result.getFetchInterval())
            result = schedule.forceRefetch(key, result, false);
        break;

    case CrawlDatum.STATUS_SIGNATURE:
        if (LOG.isWarnEnabled()) {
            LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
        }
        return;

    case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
        if (oldSet) {
            result.setSignature(old.getSignature()); // use old signature
        }
        result = schedule.setPageRetrySchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        if (result.getRetriesSinceFetch() < retryMax) {
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        } else {
            result.setStatus(CrawlDatum.STATUS_DB_GONE);
            result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                    fetch.getFetchTime());
        }
        break;

    case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
        if (oldSet)
            result.setSignature(old.getSignature()); // use old signature
        result.setStatus(CrawlDatum.STATUS_DB_GONE);
        result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        break;

    default:
        throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
    }

    try {
        scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't update score, key=" + key + ": " + e);
        }
    }
    // remove generation time, if any
    result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
    output.collect(key, result);
    reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(result.getStatus())).increment(1);
}
From source file:io.apigee.lembos.mapreduce.converters.input.MapWritableConverter.java
License:Apache License
/**
 * Takes in a {@link MapWritable} and returns a {@link Scriptable} map.
 *
 * @param scope the JavaScript scope
 * @param writable the value to convert
 *
 * @return the {@link Scriptable} map equivalent
 */
@Override
public Object toJavaScript(final Scriptable scope, final MapWritable writable) {
    final Map<Object, Object> writableMap = new HashMap<>();
    for (final Map.Entry<Writable, Writable> mapEntry : writable.entrySet()) {
        writableMap.put(ConversionUtils.writableToJS(mapEntry.getKey(), scope),
                ConversionUtils.writableToJS(mapEntry.getValue(), scope));
    }
    return JavaScriptUtils.asObject(scope, writableMap);
}
From source file:net.sf.katta.integrationTest.lib.lucene.LuceneClientTest.java
License:Apache License
@Test
public void testGetDetailsConcurrently() throws KattaException, ParseException, InterruptedException {
    deployTestIndices(1, 1);
    ILuceneClient client = new LuceneClient(_miniCluster.getZkConfiguration());
    final Query query = new QueryParser(Version.LUCENE_35, "", new KeywordAnalyzer()).parse("content: the");
    final Hits hits = client.search(query, new String[] { INDEX_NAME }, 10);
    assertNotNull(hits);
    assertEquals(10, hits.getHits().size());
    List<MapWritable> detailList = client.getDetails(hits.getHits());
    assertEquals(hits.getHits().size(), detailList.size());
    for (int i = 0; i < detailList.size(); i++) {
        final MapWritable details1 = client.getDetails(hits.getHits().get(i));
        final MapWritable details2 = detailList.get(i);
        assertEquals(details1.entrySet(), details2.entrySet());
        final Set<Writable> keySet = details2.keySet();
        assertFalse(keySet.isEmpty());
        final Writable writable = details2.get(new Text("path"));
        assertNotNull(writable);
    }
    client.close();
}
From source file:nutchIndexer.NutchReduce.java
License:Open Source License
@Override
public void reduce(IntWritable docId, Iterable<MapWritable> documentsAnalyzed, Context context)
        throws IOException, InterruptedException {
    for (MapWritable documentAnalyzed : documentsAnalyzed) {
        for (MapWritable.Entry<Writable, Writable> termEntry : documentAnalyzed.entrySet()) {
            Text term = (Text) termEntry.getKey();
            IntWritable freq = (IntWritable) termEntry.getValue();
            Integer documentId = docId.get();
            this.invertedIndex.addPosting(term, documentId, freq);
        }
    }
}