List of usage examples for org.apache.solr.common.SolrInputDocument#getFieldValues
@Override
public Collection<Object> getFieldValues(String name)
From source file:com.francelabs.datafari.updateprocessor.DatafariUpdateProcessor.java
License:Apache License
@Override public void processAdd(final AddUpdateCommand cmd) throws IOException { final SolrInputDocument doc = cmd.getSolrInputDocument(); // Sometimes Tika put several ids so we keep the first one which is // always the right one if (doc.getFieldValues("id").size() > 1) { final Object id = doc.getFieldValue("id"); doc.remove("id"); doc.addField("id", id); }//from w w w .j a v a 2s.com // Try to retrieve at the ignored_filelastmodified field to set it's // value in the last_modified field if (doc.getFieldValue("ignored_filelastmodified") != null) { final Object last_modified = doc.getFieldValue("ignored_filelastmodified"); doc.remove("last_modified"); doc.addField("last_modified", last_modified); } // Sometimes Tika put several last_modified dates, so we keep the first // one which is always the right one if ((doc.getFieldValues("last_modified") != null) && (doc.getFieldValues("last_modified").size() > 1)) { final Object last_modified = doc.getFieldValue("last_modified"); doc.remove("last_modified"); doc.addField("last_modified", last_modified); } final String url = (String) doc.getFieldValue("id"); // Create path hierarchy for facet final List<String> urlHierarchy = new ArrayList<>(); /* * // Create path hierarchy for facet * * final List<String> urlHierarchy = new ArrayList<String>(); * * final String path = url.replace("file:", ""); int previousIndex = 1; int * depth = 0; // Tokenize the path and add the depth as first character for * each token // (like: 0/home, 1/home/project ...) 
for (int i = 0; i < * path.split("/").length - 2; i++) { int endIndex = path.indexOf('/', * previousIndex); if (endIndex == -1) { endIndex = path.length() - 1; } * urlHierarchy.add(depth + path.substring(0, endIndex)); depth++; * previousIndex = endIndex + 1; } * * // Add the tokens to the urlHierarchy field doc.addField("urlHierarchy", * urlHierarchy); */ doc.addField("url", url); String filename = ""; final SolrInputField streamNameField = doc.get("ignored_stream_name"); if (streamNameField != null) { filename = (String) streamNameField.getFirstValue(); } else { final Pattern pattern = Pattern.compile("[^/]*$"); final Matcher matcher = pattern.matcher(url); if (matcher.find()) { filename = matcher.group(); } } if (url.startsWith("http")) { if (doc.get("title") == null) { doc.addField("title", filename); } doc.addField("source", "web"); } if (url.startsWith("file")) { doc.removeField("title"); doc.addField("title", filename); doc.addField("source", "file"); } String extension = ""; URL urlObject = new URL(url); String path = urlObject.getPath(); final SolrInputField mimeTypeField = doc.get("ignored_content_type"); String nameExtension = FilenameUtils.getExtension(path); String tikaExtension = mimeTypeField == null ? "" : extensionFromMimeTypeField(mimeTypeField); if (extensionFromName) { extension = nameExtension.length() > 1 && nameExtension.length() < 5 ? nameExtension : tikaExtension; } else { extension = tikaExtension.length() > 1 && tikaExtension.length() < 5 ? tikaExtension : nameExtension; } /* if (extensionFromName || mimeTypeField == null) { if (path.contains(".")){ extension = FilenameUtils.getExtension(path); if (extension.length() > 4 || extension.length() < 1) { // If length is too long, try extracting from tika information if available String tryExtension = mimeTypeField==null ? 
null : extensionFromMimeTypeField(mimeTypeField); if (tryExtension != null) { extension = tryExtension; } else { // Else default to bin for anything else extension = "bin"; } } } else if (urlObject.getProtocol().equals("http") || urlObject.getProtocol().equals("https")) { extension = null; if (mimeTypeField != null) { extension = extensionFromMimeTypeField(mimeTypeField); } if (extension == null) { extension = "html"; } } } else { extension = extensionFromMimeTypeField(mimeTypeField); if (extension == null) { extension = FilenameUtils.getExtension(path); } } */ doc.addField("extension", extension.toLowerCase()); super.processAdd(cmd); }
From source file:com.ifactory.press.db.solr.processor.FieldMergingProcessor.java
License:Apache License
@Override public void processAdd(AddUpdateCommand cmd) throws IOException { if (sourceAnalyzers != null && destinationField != null) { SolrInputDocument doc = cmd.getSolrInputDocument(); for (Map.Entry<String, PoolingAnalyzerWrapper> entry : sourceAnalyzers.entrySet()) { String sourceFieldName = entry.getKey(); Analyzer fieldAnalyzer = entry.getValue(); Collection<Object> fieldValues = doc.getFieldValues(sourceFieldName); if (fieldValues != null) { for (Object value : fieldValues) { IndexableField fieldValue = new TextField(destinationField, fieldAnalyzer.tokenStream(sourceFieldName, value.toString())); doc.addField(destinationField, fieldValue); }/*from ww w.j ava2 s. c o m*/ } } } if (next != null) next.processAdd(cmd); // and then release all the analyzers, readying them for re-use for (Map.Entry<String, PoolingAnalyzerWrapper> entry : sourceAnalyzers.entrySet()) { entry.getValue().release(); } }
From source file:com.plugtree.solrmeter.extractor.FileInputDocumentExtractorTestCase.java
License:Apache License
public void testSingleDoc() throws FileNotFoundException { FileInputDocumentExtractorSpy extractor = new FileInputDocumentExtractorSpy( FileUtils.findFileAsString("FileInputDocumentExtractorTestCase1.txt")); assertEquals(1, extractor.getParsedDocuments().size()); for (int i = 0; i < 10; i++) { SolrInputDocument document = extractor.getRandomDocument(); // fieldName1=value1;fieldName2=value2;fieldName3=value3 Iterator<Object> values = document.getFieldValues("fieldName1").iterator(); assertEquals("value1", values.next()); assertEquals("value2", values.next()); assertEquals("value2", document.getFieldValue("fieldName2")); assertEquals("value3", document.getFieldValue("fieldName3")); }//from w ww . j a va 2s.c o m }
From source file:com.talis.rdf.solr.DefaultDocumentBuilderTest.java
License:Apache License
@Test public void graphURIFromEveryQuadAddedToDocument() { ArrayList<Quad> quads = new ArrayList<Quad>(); int NUMBER_OF_QUADS = 10; for (int i = 0; i < NUMBER_OF_QUADS; i++) { quads.add(new Quad(Node.createURI(GRAPH_URI + i), Node.createURI(SUBJECT_URI), Node.createURI(PREDICATE_BASE), Node.createLiteral(OBJECT_BASE))); }/*w ww . j a v a2 s . c o m*/ SolrInputDocument doc = quadsToDoc.getDocument(DOCUMENT_KEY, quads); assertNotNull(doc); Collection<Object> graphUriValues = doc.getFieldValues(com.talis.rdf.solr.FieldNames.GRAPH_URI); assertNotNull(graphUriValues); assertEquals(NUMBER_OF_QUADS, graphUriValues.size()); for (int i = 0; i < NUMBER_OF_QUADS; i++) { assertTrue(graphUriValues.contains(GRAPH_URI + i)); } }
From source file:com.talis.rdf.solr.DefaultDocumentBuilderTest.java
License:Apache License
@Test public void duplicateGraphURIsAreOnlyAddedToDocumentOnce() { ArrayList<Quad> quads = new ArrayList<Quad>(); int NUMBER_OF_QUADS = 10; for (int i = 0; i < NUMBER_OF_QUADS; i++) { quads.add(new Quad(Node.createURI(GRAPH_URI), Node.createURI(SUBJECT_URI), Node.createURI(PREDICATE_BASE), Node.createLiteral(OBJECT_BASE + i))); }/*from w w w. ja v a2 s . c om*/ SolrInputDocument doc = quadsToDoc.getDocument(DOCUMENT_KEY, quads); assertNotNull(doc); Collection<Object> graphUriValues = doc.getFieldValues(com.talis.rdf.solr.FieldNames.GRAPH_URI); assertEquals(1, graphUriValues.size()); }
From source file:de.hebis.it.hds.gnd.in.MarcXmlParser.java
License:Open Source License
/** * @param doc/*from ww w . j av a 2s . c o m*/ */ private boolean checkAndLog(SolrInputDocument doc, String marcXml) { if (LOG.isTraceEnabled()) LOG.trace("New Document: " + doc.toString()); String docId = (String) doc.getFieldValue("id"); if (docId == null) { LOG.error("No Id found in " + marcXml.replace('\n', ' ')); return false; } if (doc.getFieldValue("preferred") == null) { LOG.error(docId + ": No preferred naming found in marcXml. " + marcXml.replace('\n', ' ')); return false; } if (LOG.isDebugEnabled()) { if (doc.getFieldValues("coordinates") != null) { for (Object coordinate : doc.getFieldValues("coordinates")) { LOG.debug(docId + ": Coordinates found [" + (String) coordinate + "]."); } } if (doc.getFieldValue("look4me") == "true") { LOG.debug(docId + ":(" + (String) doc.getFieldValue("preferred") + ") Needs a second pass."); } } int counterNow = counter.getAndIncrement(); if (counterNow % 10000 == 0) LOG.info("Records processed: " + counterNow); return true; }
From source file:eu.annocultor.converters.europeana.RecordCompletenessRanking.java
License:Apache License
/** * Checking completeness at ingestion to store in solr index. */// w ww.j a v a 2 s . c o m public static int rankRecordCompleteness(SolrInputDocument document) { List<String> tags = new ArrayList<String>(); tags.add(objectAsString(document.getFieldValue("dc_coverage"))); tags.add(objectAsString(document.getFieldValue("dc_contributor"))); // tags.add(objectAsString(document.getFieldValue("dc_description"))); tags.add(objectAsString(document.getFieldValue("dc_creator"))); tags.add(objectAsString(document.getFieldValue("dc_date"))); tags.add(objectAsString(document.getFieldValue("dc_format"))); tags.add(objectAsString(document.getFieldValue("dc_identifier"))); tags.add(objectAsString(document.getFieldValue("dc_language"))); tags.add(objectAsString(document.getFieldValue("dc_publisher"))); tags.add(objectAsString(document.getFieldValue("dc_relation"))); tags.add(objectAsString(document.getFieldValue("dc_rights"))); tags.add(objectAsString(document.getFieldValue("dc_source"))); tags.add(objectAsString(document.getFieldValue("dc_subject"))); // tags.add(objectAsString(document.getFieldValue("dc_title"))); // tags.add(objectAsString(document.getFieldValue("dc_type"))); tags.add(objectAsString(document.getFieldValue("dcterms_alternative"))); tags.add(objectAsString(document.getFieldValue("dcterms_created"))); tags.add(objectAsString(document.getFieldValue("dcterms_conformsTo"))); tags.add(objectAsString(document.getFieldValue("dcterms_extent"))); tags.add(objectAsString(document.getFieldValue("dcterms_hasFormat"))); tags.add(objectAsString(document.getFieldValue("dcterms_hasPart"))); tags.add(objectAsString(document.getFieldValue("dcterms_hasVersion"))); tags.add(objectAsString(document.getFieldValue("dcterms_isFormatOf"))); tags.add(objectAsString(document.getFieldValue("dcterms_isPartOf"))); tags.add(objectAsString(document.getFieldValue("dcterms_isReferencedBy"))); tags.add(objectAsString(document.getFieldValue("dcterms_isReplacedBy"))); 
tags.add(objectAsString(document.getFieldValue("dcterms_isRequiredBy"))); tags.add(objectAsString(document.getFieldValue("dcterms_issued"))); tags.add(objectAsString(document.getFieldValue("dcterms_isVersionOf"))); tags.add(objectAsString(document.getFieldValue("dcterms_medium"))); tags.add(objectAsString(document.getFieldValue("dcterms_provenance"))); tags.add(objectAsString(document.getFieldValue("dcterms_references"))); tags.add(objectAsString(document.getFieldValue("dcterms_replaces"))); tags.add(objectAsString(document.getFieldValue("dcterms_requires"))); tags.add(objectAsString(document.getFieldValue("dcterms_spatial"))); tags.add(objectAsString(document.getFieldValue("dcterms_tableOfContents"))); tags.add(objectAsString(document.getFieldValue("dcterms_temporal"))); String thumbnailUrl = objectAsString(document.getFieldValue("europeana_object")); String title = objectAsString(document.getFieldValue("dc_title")); String description = objectAsString(StringUtils.join(document.getFieldValues("dc_description"), ".")); return rankRecordCompleteness(thumbnailUrl, title, description, tags); }
From source file:eu.annocultor.converters.solr.SolrDocumentTagger.java
License:Apache License
static int countWords(SolrInputDocument document, String fieldNamePrefix) { int count = 0; for (String fieldName : document.getFieldNames()) { if (fieldName.startsWith(fieldNamePrefix)) { for (Object fieldValue : document.getFieldValues(fieldName)) { if (fieldValue != null) { String delims = "[\\W]+"; String[] words = fieldValue.toString().split(delims); for (String word : words) { if (!StringUtils.isBlank(word) && word.length() >= MIN_WORD_LENGTH_TO_INCLUDE_IN_WORD_COUNT) { count++;//w w w .j a va2 s. co m } } } } } } return count; }
From source file:eu.annocultor.converters.solr.SolrTagger.java
License:Apache License
void tag(SolrInputDocument document) throws Exception, ParseException { broaderTerms = new ArrayList<Term>(); broaderLabels = new HashSet<String>(); beforeDocument(document);/*from w ww . j a v a2 s . com*/ Set<String> codes = new HashSet<String>(); Set<String> labels = new HashSet<String>(); for (FieldRulePair frp : fieldRulePairs) { Collection<Object> values = document.getFieldValues(frp.getField()); if (values != null) { for (Object valueObject : values) { if (valueObject != null) { String value = valueObject.toString(); if (!StringUtils.isBlank(value)) { Triple triple = new Triple("http://xxx", null, new LiteralValue(value), null); frp.getRule().fire(triple, null); TermList terms = frp.getRule().getLastMatch(); if (terms != null) { for (Term term : terms) { codes.add(term.getCode()); TermList altTerms = vocabulary.findByCode(new CodeURI(term.getCode())); for (Term altTerm : altTerms) { if (shouldInclude(altTerm)) { labels.add(altTerm.getLabel()); } } afterTermMatched(term); broaderTerms.addAll(parentTermReconstructor.allParents(term)); } } } } } } } for (String code : codes) { document.addField(termFieldName, code); } for (String label : labels) { document.addField(labelFieldName, label); } computeBroaderLabels(); afterDocument(document); addBroaderTermsAndLabels(document); }
From source file:eu.clarin.cmdi.vlo.importer.MetadataImporter.java
/** * Adds two fields FIELD_FORMAT and FIELD_RESOURCE. The Type can be * specified in the "ResourceType" element of an imdi file or possibly * overwritten by some more specific xpath (as in the LRT cmdi files). So if * a type is overwritten and already in the solrDocument we take that type. * * @param solrDocument/*from w ww.j a v a 2s . c o m*/ * @param cmdiData */ protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) { List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT) ? new ArrayList<>(solrDocument.getFieldValues(FacetConstants.FIELD_FORMAT)) : null; solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values they might be overwritten. List<Resource> resources = cmdiData.getDataResources(); for (int i = 0; i < resources.size(); i++) { Resource resource = resources.get(i); String mimeType = resource.getMimeType(); if (mimeType == null) { if (fieldValues != null && i < fieldValues.size()) { mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString()); } else { mimeType = CommonUtils.normalizeMimeType(""); } } FormatPostProcessor processor = new FormatPostProcessor(); mimeType = processor.process(mimeType).get(0); // TODO check should probably be moved into Solr (by using some minimum length filter) if (!mimeType.equals("")) { solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType); } solrDocument.addField(FacetConstants.FIELD_RESOURCE, mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR + resource.getResourceName()); } solrDocument.addField(FacetConstants.FIELD_RESOURCE_COUNT, resources.size()); }