List of usage examples for org.apache.hadoop.io.Text.equals
@Override public boolean equals(Object o)
Returns true if and only if o is a Text with the same contents.
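Before the full examples below, a minimal sketch of that contract (the TextEqualsDemo class and its sample values are our own illustration, not drawn from any of the source files listed): equality is based on the Text's UTF-8 byte contents, and a Text is never equal to a plain String.

import org.apache.hadoop.io.Text;

public class TextEqualsDemo {
  public static void main(String[] args) {
    Text a = new Text("row1");
    Text b = new Text("row1");

    System.out.println(a.equals(b));       // true: identical byte contents
    System.out.println(a.equals("row1"));  // false: a String is never equal to a Text

    b.set("row2");
    System.out.println(a.equals(b));       // false: contents now differ
  }
}

Because equality is byte-based, reusing a single Text instance and calling set(...) between comparisons, as several of the examples below do, is a common allocation-saving idiom.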
From source file:org.apache.accumulo.test.iterator.WholeRowIteratorTest.java
License:Apache License
private static TreeMap<Key, Value> createOutputData() {
  TreeMap<Key, Value> data = new TreeMap<>();
  Text row = null;
  List<Key> keys = new ArrayList<>();
  List<Value> values = new ArrayList<>();
  // Generate the output data from the input data
  for (Entry<Key, Value> entry : INPUT_DATA.entrySet()) {
    if (null == row) {
      row = entry.getKey().getRow();
    }
    if (!row.equals(entry.getKey().getRow())) {
      // Moved to the next row
      try {
        // Serialize and save
        Value encoded = WholeRowIterator.encodeRow(keys, values);
        data.put(new Key(row), encoded);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      // Empty the aggregated k-v's
      keys = new ArrayList<>();
      values = new ArrayList<>();
      // Set the new current row
      row = entry.getKey().getRow();
    }
    // Aggregate the current row
    keys.add(entry.getKey());
    values.add(entry.getValue());
  }
  if (!keys.isEmpty()) {
    try {
      Value encoded = WholeRowIterator.encodeRow(keys, values);
      data.put(new Key(row), encoded);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  return data;
}
From source file:org.apache.accumulo.test.performance.scan.CollectTabletStats.java
License:Apache License
private static void calcTabletStats(Connector conn, String table, Authorizations auths, int batchSize,
    KeyExtent ke, String[] columns) throws Exception {
  // long t1 = System.currentTimeMillis();
  Scanner scanner = conn.createScanner(table, auths);
  scanner.setBatchSize(batchSize);
  scanner.setRange(new Range(ke.getPrevEndRow(), false, ke.getEndRow(), true));
  for (String c : columns) {
    scanner.fetchColumnFamily(new Text(c));
  }
  Stat rowLen = new Stat();
  Stat cfLen = new Stat();
  Stat cqLen = new Stat();
  Stat cvLen = new Stat();
  Stat valLen = new Stat();
  Stat colsPerRow = new Stat();
  Text lastRow = null;
  int colsPerRowCount = 0;
  for (Entry<Key, Value> entry : scanner) {
    Key key = entry.getKey();
    Text row = key.getRow();
    if (lastRow == null) {
      lastRow = row;
    }
    if (!lastRow.equals(row)) {
      colsPerRow.addStat(colsPerRowCount);
      lastRow = row;
      colsPerRowCount = 0;
    }
    colsPerRowCount++;
    rowLen.addStat(row.getLength());
    cfLen.addStat(key.getColumnFamilyData().length());
    cqLen.addStat(key.getColumnQualifierData().length());
    cvLen.addStat(key.getColumnVisibilityData().length());
    valLen.addStat(entry.getValue().get().length);
  }
  synchronized (System.out) {
    System.out.println("");
    System.out.println("\tTablet " + ke.getUUID() + " statistics : ");
    printStat("Row length", rowLen);
    printStat("Column family length", cfLen);
    printStat("Column qualifier length", cqLen);
    printStat("Column visibility length", cvLen);
    printStat("Value length", valLen);
    printStat("Columns per row", colsPerRow);
    System.out.println("");
  }
}
From source file:org.apache.accumulo.tserver.tablet.TabletData.java
License:Apache License
public TabletData(KeyExtent extent, VolumeManager fs, Iterator<Entry<Key, Value>> entries) {
  final Text family = new Text();
  Text rowName = extent.getMetadataEntry();
  while (entries.hasNext()) {
    Entry<Key, Value> entry = entries.next();
    Key key = entry.getKey();
    Value value = entry.getValue();
    key.getColumnFamily(family);
    if (key.compareRow(rowName) != 0) {
      log.info("Unexpected metadata table entry for {}: {}", extent, key.getRow());
      continue;
    }
    if (ServerColumnFamily.TIME_COLUMN.hasColumns(entry.getKey())) {
      if (time == null) {
        time = value.toString();
      }
    } else if (DataFileColumnFamily.NAME.equals(family)) {
      FileRef ref = new FileRef(fs, key);
      dataFiles.put(ref, new DataFileValue(entry.getValue().get()));
    } else if (DIRECTORY_COLUMN.hasColumns(key)) {
      directory = value.toString();
    } else if (family.equals(LogColumnFamily.NAME)) {
      logEntris.add(LogEntry.fromKeyValue(key, entry.getValue()));
    } else if (family.equals(ScanFileColumnFamily.NAME)) {
      scanFiles.add(new FileRef(fs, key));
    } else if (FLUSH_COLUMN.hasColumns(key)) {
      flushID = Long.parseLong(value.toString());
    } else if (COMPACT_COLUMN.hasColumns(key)) {
      compactID = Long.parseLong(entry.getValue().toString());
    } else if (family.equals(LastLocationColumnFamily.NAME)) {
      lastLocation = new TServerInstance(value, key.getColumnQualifier());
    } else if (family.equals(BulkFileColumnFamily.NAME)) {
      Long id = Long.decode(value.toString());
      List<FileRef> lst = bulkImported.get(id);
      if (lst == null) {
        bulkImported.put(id, lst = new ArrayList<>());
      }
      lst.add(new FileRef(fs, key));
    } else if (PREV_ROW_COLUMN.hasColumns(key)) {
      KeyExtent check = new KeyExtent(key.getRow(), value);
      if (!check.equals(extent)) {
        throw new RuntimeException("Found bad entry for " + extent + ": " + check);
      }
    }
  }
  if (time == null && dataFiles.isEmpty() && extent.equals(RootTable.OLD_EXTENT)) {
    // recovery... old root tablet has no data, so time doesn't matter:
    time = TabletTime.LOGICAL_TIME_ID + "" + Long.MIN_VALUE;
  }
}
From source file:org.apache.accumulo.tserver.TabletServer.java
License:Apache License
static Value checkTabletMetadata(KeyExtent extent, TServerInstance instance,
    SortedMap<Key, Value> tabletsKeyValues, Text metadataEntry) throws AccumuloException {
  TServerInstance future = null;
  Value prevEndRow = null;
  Value dir = null;
  Value time = null;
  for (Entry<Key, Value> entry : tabletsKeyValues.entrySet()) {
    Key key = entry.getKey();
    if (!metadataEntry.equals(key.getRow())) {
      log.info("Unexpected row in tablet metadata " + metadataEntry + " " + key.getRow());
      return null;
    }
    Text cf = key.getColumnFamily();
    if (cf.equals(TabletsSection.FutureLocationColumnFamily.NAME)) {
      if (future != null) {
        throw new AccumuloException("Tablet has multiple future locations " + extent);
      }
      future = new TServerInstance(entry.getValue(), key.getColumnQualifier());
    } else if (cf.equals(TabletsSection.CurrentLocationColumnFamily.NAME)) {
      log.info("Tablet seems to be already assigned to "
          + new TServerInstance(entry.getValue(), key.getColumnQualifier()));
      return null;
    } else if (TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key)) {
      prevEndRow = entry.getValue();
    } else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) {
      dir = entry.getValue();
    } else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) {
      time = entry.getValue();
    }
  }
  if (prevEndRow == null) {
    throw new AccumuloException("Metadata entry does not have prev row (" + metadataEntry + ")");
  } else {
    KeyExtent ke2 = new KeyExtent(metadataEntry, prevEndRow);
    if (!extent.equals(ke2)) {
      log.info("Tablet prev end row mismatch " + extent + " " + ke2.getPrevEndRow());
      return null;
    }
  }
  if (dir == null) {
    throw new AccumuloException("Metadata entry does not have directory (" + metadataEntry + ")");
  }
  if (time == null && !extent.equals(RootTable.OLD_EXTENT)) {
    throw new AccumuloException("Metadata entry does not have time (" + metadataEntry + ")");
  }
  if (future == null) {
    log.info("The master has not assigned " + extent + " to " + instance);
    return null;
  }
  if (!instance.equals(future)) {
    log.info("Table " + extent + " has been assigned to " + future + " which is not " + instance);
    return null;
  }
  return dir;
}
From source file:org.apache.giraph.hadoop.BspTokenSelector.java
License:Apache License
@SuppressWarnings("unchecked")
@Override
public Token<JobTokenIdentifier> selectToken(Text service,
    Collection<Token<? extends TokenIdentifier>> tokens) {
  if (service == null) {
    return null;
  }
  Text kindName = new Text("mapreduce.job");
  for (Token<? extends TokenIdentifier> token : tokens) {
    if (kindName.equals(token.getKind())) {
      return (Token<JobTokenIdentifier>) token;
    }
  }
  return null;
}
From source file:org.apache.gora.accumulo.store.AccumuloStore.java
License:Apache License
public ByteSequence populate(Iterator<Entry<Key, Value>> iter, T persistent) throws IOException {
  ByteSequence row = null;
  Map<Utf8, Object> currentMap = null;
  List currentArray = null;
  Text currentFam = null;
  int currentPos = 0;
  Schema currentSchema = null;
  Field currentField = null;

  BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(new byte[0], null);

  while (iter.hasNext()) {
    Entry<Key, Value> entry = iter.next();

    if (row == null) {
      row = entry.getKey().getRowData();
    }
    byte[] val = entry.getValue().get();

    Field field = fieldMap.get(getFieldName(entry));

    if (currentMap != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
            fromBytes(currentSchema, entry.getValue().get()));
        continue;
      } else {
        persistent.put(currentPos, currentMap);
        currentMap = null;
      }
    } else if (currentArray != null) {
      if (currentFam.equals(entry.getKey().getColumnFamily())) {
        currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
        continue;
      } else {
        persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
        currentArray = null;
      }
    }

    switch (field.schema().getType()) {
    case MAP: // first entry only. Next are handled above on the next loop
      currentMap = new DirtyMapWrapper<Utf8, Object>(new HashMap<Utf8, Object>());
      currentPos = field.pos();
      currentFam = entry.getKey().getColumnFamily();
      currentSchema = field.schema().getValueType();
      currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
          fromBytes(currentSchema, entry.getValue().get()));
      break;
    case ARRAY:
      currentArray = new DirtyListWrapper<Object>(new ArrayList<Object>());
      currentPos = field.pos();
      currentFam = entry.getKey().getColumnFamily();
      currentSchema = field.schema().getElementType();
      currentField = field;
      currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
      break;
    case UNION: // default value of null acts like union with null
      Schema effectiveSchema = field.schema().getTypes().get(firstNotNullSchemaTypeIndex(field.schema()));
      // map and array were coded without union index so need to be read the same way
      if (effectiveSchema.getType() == Type.ARRAY) {
        currentArray = new DirtyListWrapper<Object>(new ArrayList<Object>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentSchema = field.schema().getElementType();
        currentField = field;
        currentArray.add(fromBytes(currentSchema, entry.getValue().get()));
        break;
      } else if (effectiveSchema.getType() == Type.MAP) {
        currentMap = new DirtyMapWrapper<Utf8, Object>(new HashMap<Utf8, Object>());
        currentPos = field.pos();
        currentFam = entry.getKey().getColumnFamily();
        currentSchema = effectiveSchema.getValueType();
        currentMap.put(new Utf8(entry.getKey().getColumnQualifierData().toArray()),
            fromBytes(currentSchema, entry.getValue().get()));
        break;
      }
      // continue like a regular top-level union
    case RECORD:
      SpecificDatumReader<?> reader = new SpecificDatumReader<Schema>(field.schema());
      persistent.put(field.pos(), reader.read(null, DecoderFactory.get().binaryDecoder(val, decoder)));
      break;
    default:
      persistent.put(field.pos(), fromBytes(field.schema(), entry.getValue().get()));
    }
  }

  if (currentMap != null) {
    persistent.put(currentPos, currentMap);
  } else if (currentArray != null) {
    persistent.put(currentPos, new GenericData.Array<T>(currentField.schema(), currentArray));
  }

  persistent.clearDirty();

  return row;
}
From source file:org.apache.mahout.clustering.minhash.LastfmClusterEvaluator.java
License:Apache License
/**
 * Calculate the overall cluster precision by sampling clusters. Precision is
 * calculated as follows:
 *
 * 1. For a sample of all the clusters calculate the pair-wise similarity
 * (Jaccard coefficient) for items in the same cluster.
 *
 * 2. Count true positives as items whose similarity is above the specified
 * threshold.
 *
 * 3. Precision = (true positives) / (total items in clusters sampled).
 *
 * @param clusterFile
 *          The file containing cluster information
 * @param threshold
 *          Similarity threshold for two items in a cluster to be considered
 *          relevant. Must be between 0.0 and 1.0
 * @param samplePercentage
 *          Percentage of clusters to sample. Must be between 0.0 and 1.0
 */
private static void testPrecision(Path clusterFile, double threshold, double samplePercentage) {
  Configuration conf = new Configuration();
  Random rand = RandomUtils.getRandom();
  Text prevCluster = new Text();
  List<List<Integer>> listenerVectors = Lists.newArrayList();
  long similarListeners = 0;
  long allListeners = 0;
  int clustersProcessed = 0;
  for (Pair<Text, VectorWritable> record : new SequenceFileIterable<Text, VectorWritable>(clusterFile,
      true, conf)) {
    Text cluster = record.getFirst();
    VectorWritable point = record.getSecond();
    if (!cluster.equals(prevCluster)) { // We got a new cluster
      prevCluster.set(cluster.toString());
      // Should we check previous cluster ?
      if (rand.nextDouble() > samplePercentage) {
        listenerVectors.clear();
        continue;
      }
      int numListeners = listenerVectors.size();
      allListeners += numListeners;
      for (int i = 0; i < numListeners; i++) {
        List<Integer> listenerVector1 = listenerVectors.get(i);
        for (int j = i + 1; j < numListeners; j++) {
          List<Integer> listenerVector2 = listenerVectors.get(j);
          double similarity = computeSimilarity(listenerVector1, listenerVector2);
          similarListeners += similarity >= threshold ? 1 : 0;
        }
      }
      listenerVectors.clear();
      clustersProcessed++;
      System.out.print('\r' + usedMemory() + " Clusters processed: " + clustersProcessed);
    }
    List<Integer> listeners = Lists.newArrayList();
    for (Vector.Element ele : point.get()) {
      listeners.add((int) ele.get());
    }
    listenerVectors.add(listeners);
  }
  System.out.println("\nTest Results");
  System.out.println("=============");
  System.out.println(" (A) Listeners in same cluster with similarity above threshold (" + threshold
      + ") : " + similarListeners);
  System.out.println(" (B) All listeners: " + allListeners);
  NumberFormat format = NumberFormat.getInstance();
  format.setMaximumFractionDigits(2);
  double precision = (double) similarListeners / allListeners * 100.0;
  System.out.println(" Average cluster precision: A/B = " + format.format(precision));
}
From source file:org.apache.nutch.fetcher.FetcherThread.java
License:Apache License
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status,
    int outlinkDepth) throws InterruptedException {

  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null)
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();

    // store the guessed content type in the crawldatum
    if (content.getContentType() != null)
      datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // add score to content metadata so that ParseSegment can pick it up.
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(),
            key, e);
      }
    }

    if (status == CrawlDatum.STATUS_FETCH_SUCCESS) {
      if (parsing && !(skipTruncated && ParseSegment.isTruncated(content))) {
        try {
          parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
          LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key,
              StringUtils.stringifyException(e));
        }
      }

      if (parseResult == null && (parsing || signatureWithoutParsing)) {
        byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
            new ParseStatus().getEmptyParse(conf));
        datum.setSignature(signature);
      }
    }

    /*
     * Store status code in content So we can read this value during parsing
     * (as a separate job) and decide to parse or not.
     */
    content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
  }

  try {
    context.write(key, new NutchWritable(datum));
    if (content != null && storingContent)
      context.write(key, new NutchWritable(content));
    if (parseResult != null) {
      for (Entry<Text, Parse> entry : parseResult) {
        Text url = entry.getKey();
        Parse parse = entry.getValue();
        ParseStatus parseStatus = parse.getData().getStatus();
        ParseData parseData = parse.getData();

        if (!parseStatus.isSuccess()) {
          LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key,
              parseStatus);
          parse = parseStatus.getEmptyParse(conf);
        }

        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);

        // Ensure segment name and score are in parseData metadata
        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        // Pass fetch time to content meta
        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
        if (url.equals(key))
          datum.setSignature(signature);

        try {
          scfilters.passScoreAfterParsing(url, content, parse);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(),
                Thread.currentThread().getId(), key, e);
          }
        }

        String origin = null;

        // collect outlinks for subsequent db update
        Outlink[] links = parseData.getOutlinks();
        int outlinksToStore = Math.min(maxOutlinks, links.length);
        if (ignoreExternalLinks || ignoreInternalLinks) {
          URL originURL = new URL(url.toString());
          // based on domain?
          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
            origin = URLUtil.getDomainName(originURL).toLowerCase();
          }
          // use host
          else {
            origin = originURL.getHost().toLowerCase();
          }
        }

        // used by fetchNode
        if (fetchNode != null) {
          fetchNode.setOutlinks(links);
          fetchNode.setTitle(parseData.getTitle());
          FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
        }
        int validCount = 0;

        // Process all outlinks, normalize, filter and deduplicate
        List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
        HashSet<String> outlinks = new HashSet<>(outlinksToStore);
        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
          String toUrl = links[i].getToUrl();

          if (toUrl.length() > maxOutlinkLength) {
            continue;
          }
          toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin,
              ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode,
              urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
          if (toUrl == null) {
            continue;
          }

          validCount++;
          links[i].setUrl(toUrl);
          outlinkList.add(links[i]);
          outlinks.add(toUrl);
        }

        // Publish fetch report event
        if (activatePublisher) {
          FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT,
              url.toString());
          reportEvent.addOutlinksToEventData(outlinkList);
          reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
          reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE,
              parseData.getContentMeta().get("content-type"));
          reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
          reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
          reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG,
              parseData.getContentMeta().get("content-language"));
          publisher.publish(reportEvent, conf);
        }

        // Only process depth N outlinks
        if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
          FetchItem ft = FetchItem.create(url, null, queueMode);
          FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
          queue.alreadyFetched.add(url.toString().hashCode());

          context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());

          // Counter to limit num outlinks to follow per page
          int outlinkCounter = 0;

          String followUrl;

          // Walk over the outlinks and add as new FetchItem to the queues
          Iterator<String> iter = outlinks.iterator();
          while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
            followUrl = iter.next();

            // Check whether we'll follow external outlinks
            if (outlinksIgnoreExternal) {
              if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                continue;
              }
            }

            // Already followed?
            int urlHashCode = followUrl.hashCode();
            if (queue.alreadyFetched.contains(urlHashCode)) {
              continue;
            }
            queue.alreadyFetched.add(urlHashCode);

            // Create new FetchItem with depth incremented
            FetchItem fit = FetchItem.create(new Text(followUrl),
                new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);

            context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);

            ((FetchItemQueues) fetchQueues).addFetchItem(fit);

            outlinkCounter++;
          }
        }

        // Overwrite the outlinks in ParseData with the normalized and
        // filtered set
        parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

        context.write(url, new NutchWritable(
            new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
      }
    }
  } catch (IOException e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:", e);
    }
  }

  // return parse status (of the "original" URL if the ParseResult contains
  // multiple parses) which allows Fetcher to follow meta-redirects
  if (parseResult != null && !parseResult.isEmpty()) {
    Parse p = parseResult.get(content.getUrl());
    if (p != null) {
      context.getCounter("ParserStatus",
          ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
      return p.getData().getStatus();
    }
  }
  return null;
}
From source file:org.apache.nutch.tools.arc.ArcSegmentCreator.java
License:Apache License
/**
 * <p>Parses the raw content of a single record to create output. This method
 * is almost the same as the {@link org.apache.nutch.Fetcher#output} method in
 * terms of processing and output.
 *
 * @param output The job output collector.
 * @param segmentName The name of the segment to create.
 * @param key The url of the record.
 * @param datum The CrawlDatum of the record.
 * @param content The raw content of the record
 * @param pstatus The protocol status
 * @param status The fetch status.
 *
 * @return The result of the parse in a ParseStatus object.
 */
private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName, Text key,
    CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) {

  // set the fetch status and the fetch time
  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null)
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();
    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // add score to content metadata so that ParseSegment can pick it up.
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
      }
    }

    try {
      // parse the content
      parseResult = this.parseUtil.parse(content);
    } catch (Exception e) {
      LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
    }

    // set the content signature
    if (parseResult == null) {
      byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
          new ParseStatus().getEmptyParse(getConf()));
      datum.setSignature(signature);
    }

    try {
      output.collect(key, new NutchWritable(datum));
      output.collect(key, new NutchWritable(content));

      if (parseResult != null) {
        for (Entry<Text, Parse> entry : parseResult) {
          Text url = entry.getKey();
          Parse parse = entry.getValue();
          ParseStatus parseStatus = parse.getData().getStatus();

          if (!parseStatus.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
            parse = parseStatus.getEmptyParse(getConf());
          }

          // Calculate page signature.
          byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);

          // Ensure segment name and score are in parseData metadata
          parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
          parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
          // Pass fetch time to content meta
          parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
          if (url.equals(key))
            datum.setSignature(signature);

          try {
            scfilters.passScoreAfterParsing(url, content, parse);
          } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
          }

          output.collect(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()),
              parse.getData(), parse.isCanonical())));
        }
      }
    } catch (IOException e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
      }
    }

    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
      Parse p = parseResult.get(content.getUrl());
      if (p != null) {
        return p.getData().getStatus();
      }
    }
  }

  return null;
}
From source file:org.apache.rya.export.accumulo.parent.AccumuloParentMetadataRepository.java
License:Apache License
private MergeParentMetadata getMetadataFromTable() throws ParentMetadataDoesNotExistException {
  try {
    // Create an Accumulo scanner that iterates through the metadata entries.
    final Scanner scanner = connector.createScanner(mergeParentMetadataTableName, new Authorizations());
    final Iterator<Entry<Key, Value>> entries = scanner.iterator();

    // No metadata has been stored in the table yet.
    if (!entries.hasNext()) {
      log.error("Could not find any MergeParentMetadata metadata in the table named: "
          + mergeParentMetadataTableName);
    }

    // Fetch the metadata from the entries.
    String ryaInstanceName = null;
    Date timestamp = null;
    Date filterTimestamp = null;
    Long parentTimeOffset = null;

    while (entries.hasNext()) {
      final Entry<Key, Value> entry = entries.next();
      final Text columnQualifier = entry.getKey().getColumnQualifier();
      final byte[] value = entry.getValue().get();

      if (columnQualifier.equals(MERGE_PARENT_METADATA_RYA_INSTANCE_NAME)) {
        ryaInstanceName = STRING_LEXICODER.decode(value);
      } else if (columnQualifier.equals(MERGE_PARENT_METADATA_TIMESTAMP)) {
        timestamp = DATE_LEXICODER.decode(value);
      } else if (columnQualifier.equals(MERGE_PARENT_METADATA_FILTER_TIMESTAMP)) {
        filterTimestamp = DATE_LEXICODER.decode(value);
      } else if (columnQualifier.equals(MERGE_PARENT_METADATA_PARENT_TIME_OFFSET)) {
        parentTimeOffset = LONG_LEXICODER.decode(value);
      }
    }

    return new MergeParentMetadata(ryaInstanceName, timestamp, filterTimestamp, parentTimeOffset);
  } catch (final TableNotFoundException e) {
    throw new ParentMetadataDoesNotExistException(
        "Could not add results to a MergeParentMetadata because the MergeParentMetadata table does not exist.",
        e);
  } catch (final Exception e) {
    throw new ParentMetadataDoesNotExistException("Error occurred while getting merge parent metadata.", e);
  }
}