Example usage for org.apache.solr.common SolrInputDocument getFieldValues

List of usage examples for org.apache.solr.common SolrInputDocument getFieldValues

Introduction

On this page you can find example usages of org.apache.solr.common.SolrInputDocument.getFieldValues.

Prototype

@Override
public Collection<Object> getFieldValues(String name) 

Source Link

Document

Get all the values for a field.

Usage

From source file:com.francelabs.datafari.updateprocessor.DatafariUpdateProcessor.java

License:Apache License

/**
 * Normalises an incoming document before handing it to the parent processor.
 * <ul>
 * <li>collapses a multi-valued {@code id} to its first value;</li>
 * <li>copies {@code ignored_filelastmodified} (when present) into
 * {@code last_modified}, then collapses {@code last_modified} to its first value;</li>
 * <li>copies the id into {@code url};</li>
 * <li>sets {@code title} and {@code source} depending on whether the url starts
 * with {@code http} or {@code file};</li>
 * <li>derives a lower-cased {@code extension} from the url path and/or the
 * {@code ignored_content_type} field.</li>
 * </ul>
 * A document without an {@code id} value is forwarded unchanged apart from the
 * {@code last_modified} normalisation, because every later step is derived from the id.
 *
 * @param cmd the add command whose document is updated in place
 * @throws IOException if the id is not a parseable URL or the parent processor fails
 */
@Override
public void processAdd(final AddUpdateCommand cmd) throws IOException {
    final SolrInputDocument doc = cmd.getSolrInputDocument();

    // Sometimes Tika puts several ids, so we keep the first one, which is
    // always the right one
    keepFirstValueOnly(doc, "id");

    // Use the ignored_filelastmodified value, when there is one, as the
    // last_modified value
    final Object fileLastModified = doc.getFieldValue("ignored_filelastmodified");
    if (fileLastModified != null) {
        doc.removeField("last_modified");
        doc.addField("last_modified", fileLastModified);
    }

    // Sometimes Tika puts several last_modified dates, so we keep the first
    // one, which is always the right one
    keepFirstValueOnly(doc, "last_modified");

    final Object id = doc.getFieldValue("id");
    if (id == null) {
        // Nothing below can be computed without an id; let the rest of the
        // chain decide how to treat the document instead of failing with a
        // NullPointerException here.
        super.processAdd(cmd);
        return;
    }
    final String url = (String) id;

    doc.addField("url", url);

    String filename = "";
    final SolrInputField streamNameField = doc.get("ignored_stream_name");
    if (streamNameField != null) {
        filename = (String) streamNameField.getFirstValue();
    } else {
        // Fall back to whatever follows the last '/' of the url
        final Pattern pattern = Pattern.compile("[^/]*$");
        final Matcher matcher = pattern.matcher(url);
        if (matcher.find()) {
            filename = matcher.group();
        }
    }

    if (url.startsWith("http")) {
        // Web documents keep an already extracted title
        if (doc.get("title") == null) {
            doc.addField("title", filename);
        }
        doc.addField("source", "web");
    }

    if (url.startsWith("file")) {
        // File documents always get the file name as title
        doc.removeField("title");
        doc.addField("title", filename);
        doc.addField("source", "file");
    }

    final URL urlObject = new URL(url);
    final String path = urlObject.getPath();
    final SolrInputField mimeTypeField = doc.get("ignored_content_type");

    final String nameExtension = FilenameUtils.getExtension(path);
    String tikaExtension = mimeTypeField == null ? "" : extensionFromMimeTypeField(mimeTypeField);
    if (tikaExtension == null) {
        // NOTE(review): extensionFromMimeTypeField is not visible here; guard in
        // case it returns null for an unknown mime type.
        tikaExtension = "";
    }

    // The preferred candidate is used only when it is 2 to 4 characters long;
    // otherwise the other candidate is taken as is.
    final String extension;
    if (extensionFromName) {
        extension = nameExtension.length() > 1 && nameExtension.length() < 5 ? nameExtension : tikaExtension;
    } else {
        extension = tikaExtension.length() > 1 && tikaExtension.length() < 5 ? tikaExtension : nameExtension;
    }
    // Locale.ROOT keeps the result independent of the JVM default locale
    doc.addField("extension", extension.toLowerCase(java.util.Locale.ROOT));

    super.processAdd(cmd);
}

/**
 * Reduces a multi-valued field to its first value; does nothing when the field
 * is absent or holds at most one value.
 */
private static void keepFirstValueOnly(final SolrInputDocument doc, final String fieldName) {
    final SolrInputField field = doc.getField(fieldName);
    if (field != null && field.getValueCount() > 1) {
        final Object first = field.getFirstValue();
        doc.removeField(fieldName);
        doc.addField(fieldName, first);
    }
}

From source file:com.ifactory.press.db.solr.processor.FieldMergingProcessor.java

License:Apache License

/**
 * Analyzes every value of each configured source field with that field's
 * analyzer and adds the resulting token stream to the destination field,
 * then forwards the command to the next processor.
 * <p>
 * The analyzers are released in a {@code finally} block so they are released
 * even when this method or a downstream processor throws.
 *
 * @param cmd the add command whose document is enriched in place
 * @throws IOException if analysis or a downstream processor fails
 */
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
    try {
        if (sourceAnalyzers != null && destinationField != null) {
            SolrInputDocument doc = cmd.getSolrInputDocument();
            for (Map.Entry<String, PoolingAnalyzerWrapper> entry : sourceAnalyzers.entrySet()) {
                String sourceFieldName = entry.getKey();
                Analyzer fieldAnalyzer = entry.getValue();
                Collection<Object> fieldValues = doc.getFieldValues(sourceFieldName);
                if (fieldValues == null) {
                    continue;
                }
                for (Object value : fieldValues) {
                    if (value == null) {
                        // nothing to analyze for a null value
                        continue;
                    }
                    IndexableField fieldValue = new TextField(destinationField,
                            fieldAnalyzer.tokenStream(sourceFieldName, value.toString()));
                    doc.addField(destinationField, fieldValue);
                }
            }
        }

        if (next != null) {
            next.processAdd(cmd);
        }
    } finally {
        // and then release all the analyzers, readying them for re-use
        if (sourceAnalyzers != null) {
            for (PoolingAnalyzerWrapper analyzer : sourceAnalyzers.values()) {
                analyzer.release();
            }
        }
    }
}

From source file:com.plugtree.solrmeter.extractor.FileInputDocumentExtractorTestCase.java

License:Apache License

/**
 * Loads the single-document fixture and checks, over ten random draws, that
 * the returned document exposes the expected field values.
 */
public void testSingleDoc() throws FileNotFoundException {

    String fixturePath = FileUtils.findFileAsString("FileInputDocumentExtractorTestCase1.txt");
    FileInputDocumentExtractorSpy spy = new FileInputDocumentExtractorSpy(fixturePath);
    assertEquals(1, spy.getParsedDocuments().size());

    int draw = 0;
    while (draw < 10) {
        SolrInputDocument drawn = spy.getRandomDocument();
        // fixture line noted by the original author:
        //      fieldName1=value1;fieldName2=value2;fieldName3=value3
        // NOTE(review): the assertions below read two values from fieldName1 — confirm against the fixture
        Iterator<Object> firstFieldValues = drawn.getFieldValues("fieldName1").iterator();
        assertEquals("value1", firstFieldValues.next());
        assertEquals("value2", firstFieldValues.next());
        assertEquals("value2", drawn.getFieldValue("fieldName2"));
        assertEquals("value3", drawn.getFieldValue("fieldName3"));
        draw++;
    }
}

From source file:com.talis.rdf.solr.DefaultDocumentBuilderTest.java

License:Apache License

/**
 * Builds ten quads that differ only in their graph URI and verifies that the
 * resulting document carries every one of those graph URIs.
 */
@Test
public void graphURIFromEveryQuadAddedToDocument() {
    final int quadCount = 10;
    ArrayList<Quad> input = new ArrayList<Quad>();
    int index = 0;
    while (index < quadCount) {
        Quad quad = new Quad(Node.createURI(GRAPH_URI + index), Node.createURI(SUBJECT_URI),
                Node.createURI(PREDICATE_BASE), Node.createLiteral(OBJECT_BASE));
        input.add(quad);
        index++;
    }

    SolrInputDocument built = quadsToDoc.getDocument(DOCUMENT_KEY, input);
    assertNotNull(built);

    Collection<Object> graphUris = built.getFieldValues(com.talis.rdf.solr.FieldNames.GRAPH_URI);
    assertNotNull(graphUris);
    assertEquals(quadCount, graphUris.size());

    // every generated graph URI must be present
    for (int suffix = 0; suffix < quadCount; suffix++) {
        String expectedUri = GRAPH_URI + suffix;
        assertTrue(graphUris.contains(expectedUri));
    }
}

From source file:com.talis.rdf.solr.DefaultDocumentBuilderTest.java

License:Apache License

/**
 * Builds ten quads that share one graph URI (only the object literal differs)
 * and verifies that the graph URI is stored in the document exactly once.
 */
@Test
public void duplicateGraphURIsAreOnlyAddedToDocumentOnce() {
    ArrayList<Quad> quads = new ArrayList<Quad>();
    final int numberOfQuads = 10;
    for (int i = 0; i < numberOfQuads; i++) {
        quads.add(new Quad(Node.createURI(GRAPH_URI), Node.createURI(SUBJECT_URI),
                Node.createURI(PREDICATE_BASE), Node.createLiteral(OBJECT_BASE + i)));
    }

    SolrInputDocument doc = quadsToDoc.getDocument(DOCUMENT_KEY, quads);
    assertNotNull(doc);
    Collection<Object> graphUriValues = doc.getFieldValues(com.talis.rdf.solr.FieldNames.GRAPH_URI);
    // getFieldValues returns null for an absent field; fail with an assertion
    // rather than a NullPointerException on size()
    assertNotNull(graphUriValues);
    assertEquals(1, graphUriValues.size());
}

From source file:de.hebis.it.hds.gnd.in.MarcXmlParser.java

License:Open Source License

/**
 * @param doc/*from ww  w . j av a  2s . c o  m*/
 */
private boolean checkAndLog(SolrInputDocument doc, String marcXml) {
    if (LOG.isTraceEnabled())
        LOG.trace("New Document: " + doc.toString());
    String docId = (String) doc.getFieldValue("id");
    if (docId == null) {
        LOG.error("No Id found in " + marcXml.replace('\n', ' '));
        return false;
    }
    if (doc.getFieldValue("preferred") == null) {
        LOG.error(docId + ": No preferred naming found in marcXml. " + marcXml.replace('\n', ' '));
        return false;
    }
    if (LOG.isDebugEnabled()) {
        if (doc.getFieldValues("coordinates") != null) {
            for (Object coordinate : doc.getFieldValues("coordinates")) {
                LOG.debug(docId + ": Coordinates found [" + (String) coordinate + "].");
            }
        }
        if (doc.getFieldValue("look4me") == "true") {
            LOG.debug(docId + ":(" + (String) doc.getFieldValue("preferred") + ") Needs a second pass.");
        }
    }
    int counterNow = counter.getAndIncrement();
    if (counterNow % 10000 == 0)
        LOG.info("Records processed: " + counterNow);
    return true;
}

From source file:eu.annocultor.converters.europeana.RecordCompletenessRanking.java

License:Apache License

/**
 * Computes the completeness rank of a record at ingestion time so it can be
 * stored in the Solr index.
 * <p>
 * The first value of each listed Dublin Core field is collected as a tag;
 * thumbnail, title and description are passed separately to the overload
 * that does the ranking.
 */
public static int rankRecordCompleteness(SolrInputDocument document) {

    // dc_description, dc_title and dc_type are intentionally not collected as tags
    final String[] tagFieldNames = {
            "dc_coverage", "dc_contributor", "dc_creator", "dc_date", "dc_format",
            "dc_identifier", "dc_language", "dc_publisher", "dc_relation", "dc_rights",
            "dc_source", "dc_subject",

            "dcterms_alternative", "dcterms_created", "dcterms_conformsTo", "dcterms_extent",
            "dcterms_hasFormat", "dcterms_hasPart", "dcterms_hasVersion", "dcterms_isFormatOf",
            "dcterms_isPartOf", "dcterms_isReferencedBy", "dcterms_isReplacedBy",
            "dcterms_isRequiredBy", "dcterms_issued", "dcterms_isVersionOf", "dcterms_medium",
            "dcterms_provenance", "dcterms_references", "dcterms_replaces", "dcterms_requires",
            "dcterms_spatial", "dcterms_tableOfContents", "dcterms_temporal" };

    List<String> tags = new ArrayList<String>();
    for (String tagFieldName : tagFieldNames) {
        tags.add(objectAsString(document.getFieldValue(tagFieldName)));
    }

    String thumbnail = objectAsString(document.getFieldValue("europeana_object"));
    String recordTitle = objectAsString(document.getFieldValue("dc_title"));
    // all description values are concatenated with "." as separator
    String joinedDescriptions = StringUtils.join(document.getFieldValues("dc_description"), ".");
    String recordDescription = objectAsString(joinedDescriptions);

    return rankRecordCompleteness(thumbnail, recordTitle, recordDescription, tags);
}

From source file:eu.annocultor.converters.solr.SolrDocumentTagger.java

License:Apache License

/**
 * Counts the words found in all fields whose name starts with the given prefix.
 * <p>
 * Values are split on runs of non-word characters; a token is counted when it
 * is not blank and has at least
 * {@code MIN_WORD_LENGTH_TO_INCLUDE_IN_WORD_COUNT} characters.
 *
 * @param document        the document to inspect
 * @param fieldNamePrefix prefix selecting the fields to count words in
 * @return the number of counted words; 0 when no field matches
 */
static int countWords(SolrInputDocument document, String fieldNamePrefix) {
    final String delims = "[\\W]+";
    int count = 0;
    for (String fieldName : document.getFieldNames()) {
        if (!fieldName.startsWith(fieldNamePrefix)) {
            continue;
        }
        // getFieldValues may return null (e.g. a field holding a null value)
        Iterable<Object> fieldValues = document.getFieldValues(fieldName);
        if (fieldValues == null) {
            continue;
        }
        for (Object fieldValue : fieldValues) {
            if (fieldValue == null) {
                continue;
            }
            for (String word : fieldValue.toString().split(delims)) {
                if (!StringUtils.isBlank(word)
                        && word.length() >= MIN_WORD_LENGTH_TO_INCLUDE_IN_WORD_COUNT) {
                    count++;
                }
            }
        }
    }
    return count;
}

From source file:eu.annocultor.converters.solr.SolrTagger.java

License:Apache License

/**
 * Tags a document: fires each configured rule on every non-blank value of the
 * rule's field, collects the codes of the matched terms and the labels of
 * their vocabulary alternatives, and adds them to the document. Broader terms
 * are gathered along the way and added at the end.
 */
void tag(SolrInputDocument document) throws Exception, ParseException {

    broaderTerms = new ArrayList<Term>();
    broaderLabels = new HashSet<String>();
    beforeDocument(document);

    Set<String> matchedCodes = new HashSet<String>();
    Set<String> matchedLabels = new HashSet<String>();

    for (FieldRulePair pair : fieldRulePairs) {
        Collection<Object> fieldValues = document.getFieldValues(pair.getField());
        if (fieldValues == null) {
            continue;
        }

        for (Object rawValue : fieldValues) {
            if (rawValue == null) {
                continue;
            }
            String text = rawValue.toString();
            if (StringUtils.isBlank(text)) {
                continue;
            }

            // wrap the value in a triple with a placeholder subject so the rule can be fired on it
            Triple probe = new Triple("http://xxx", null, new LiteralValue(text), null);
            pair.getRule().fire(probe, null);

            TermList matched = pair.getRule().getLastMatch();
            if (matched == null) {
                continue;
            }

            for (Term hit : matched) {
                matchedCodes.add(hit.getCode());
                TermList alternatives = vocabulary.findByCode(new CodeURI(hit.getCode()));
                for (Term alternative : alternatives) {
                    if (shouldInclude(alternative)) {
                        matchedLabels.add(alternative.getLabel());
                    }
                }
                afterTermMatched(hit);
                broaderTerms.addAll(parentTermReconstructor.allParents(hit));
            }
        }
    }

    for (String matchedCode : matchedCodes) {
        document.addField(termFieldName, matchedCode);
    }
    for (String matchedLabel : matchedLabels) {
        document.addField(labelFieldName, matchedLabel);
    }

    computeBroaderLabels();
    afterDocument(document);
    addBroaderTermsAndLabels(document);
}

From source file:eu.clarin.cmdi.vlo.importer.MetadataImporter.java

/**
 * Adds two fields FIELD_FORMAT and FIELD_RESOURCE. The Type can be
 * specified in the "ResourceType" element of an imdi file or possibly
 * overwritten by some more specific xpath (as in the LRT cmdi files). So if
 * a type is overwritten and already in the solrDocument we take that type.
 *
 * @param solrDocument/*from  w  ww.j a v a 2s  . c o  m*/
 * @param cmdiData
 */
protected void addResourceData(SolrInputDocument solrDocument, CMDIData cmdiData) {
    List<Object> fieldValues = solrDocument.containsKey(FacetConstants.FIELD_FORMAT)
            ? new ArrayList<>(solrDocument.getFieldValues(FacetConstants.FIELD_FORMAT))
            : null;
    solrDocument.removeField(FacetConstants.FIELD_FORMAT); //Remove old values they might be overwritten.
    List<Resource> resources = cmdiData.getDataResources();
    for (int i = 0; i < resources.size(); i++) {
        Resource resource = resources.get(i);
        String mimeType = resource.getMimeType();
        if (mimeType == null) {
            if (fieldValues != null && i < fieldValues.size()) {
                mimeType = CommonUtils.normalizeMimeType(fieldValues.get(i).toString());
            } else {
                mimeType = CommonUtils.normalizeMimeType("");
            }
        }

        FormatPostProcessor processor = new FormatPostProcessor();
        mimeType = processor.process(mimeType).get(0);

        // TODO check should probably be moved into Solr (by using some minimum length filter)
        if (!mimeType.equals("")) {
            solrDocument.addField(FacetConstants.FIELD_FORMAT, mimeType);
        }
        solrDocument.addField(FacetConstants.FIELD_RESOURCE,
                mimeType + FacetConstants.FIELD_RESOURCE_SPLIT_CHAR + resource.getResourceName());
    }
    solrDocument.addField(FacetConstants.FIELD_RESOURCE_COUNT, resources.size());
}