Example usage for org.apache.lucene.document Document getFields

List of usage examples for org.apache.lucene.document Document getFields

Introduction

On this page you can find an example usage for org.apache.lucene.document Document getFields.

Prototype

public IndexableField[] getFields(String name) 

Source Link

Document

Returns an array of IndexableFields with the given name.

Usage

From source file:SimpleNaiveBayesDocumentClassifier.java

License:Apache License

/**
 * Performs the analysis of the seed document and extracts per-field boosts if present.
 * This is done only once for the seed document.
 *
 * @param inputDocument         the seed unseen document
 * @param fieldName2tokensArray a map associating each field name with the list of token arrays for all its values
 * @param fieldName2boost       a map associating each field name with its boost
 * @throws IOException If there is a low-level I/O error
 */
private void analyzeSeedDocument(Document inputDocument, Map<String, List<String[]>> fieldName2tokensArray,
        Map<String, Float> fieldName2boost) throws IOException {
    for (int i = 0; i < textFieldNames.length; i++) {
        String rawName = textFieldNames[i];
        float boost = 1;
        // A field may carry an inline boost using the "field^boost" syntax.
        if (rawName.contains("^")) {
            String[] nameAndBoost = rawName.split("\\^");
            rawName = nameAndBoost[0];
            boost = Float.parseFloat(nameAndBoost[1]);
        }
        List<String[]> tokenizedValues = new LinkedList<>();
        for (IndexableField fieldValue : inputDocument.getFields(rawName)) {
            TokenStream stream = fieldValue.tokenStream(field2analyzer.get(rawName), null);
            tokenizedValues.add(getTokenArray(stream));
        }
        fieldName2tokensArray.put(rawName, tokenizedValues);
        fieldName2boost.put(rawName, boost);
        // Store the stripped field name so later passes need not re-parse the boost.
        textFieldNames[i] = rawName;
    }
}

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * Collects candidate words for a more-like-this query from a single document.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @throws IOException if there is a low-level I/O error
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Terms vector = ir.getTermVector(docNum, fieldName);
        if (vector != null) {
            addTermFrequencies(termFreqMap, vector);
            continue;
        }
        // The field does not store term vector info:
        // fall back to re-analyzing the stored field values.
        Document d = ir.document(docNum);
        for (IndexableField field : d.getFields(fieldName)) {
            final String stringValue = field.stringValue();
            if (stringValue != null) {
                addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
            }
        }
    }
    return createQueue(termFreqMap);
}

From source file:axiom.objectmodel.dom.LuceneManager.java

License:Open Source License

/**
 * Converts a Lucene document into a map of field name to value.
 * Search-only fields are decoded from all their stored instances via
 * {@code undoFields}; every other field contributes its plain string value.
 */
public static HashMap luceneDocumentToMap(Document doc) {
    HashMap map = new HashMap();
    for (Enumeration e = doc.fields(); e.hasMoreElements();) {
        Field f = (Field) e.nextElement();
        String fieldname = f.name();
        if (!LuceneManager.isSearchOnlyField(fieldname)) {
            map.put(fieldname, f.stringValue());
        } else {
            map.put(fieldname, undoFields(doc.getFields(fieldname)));
        }
    }
    return map;
}

From source file:axiom.objectmodel.dom.LuceneManager.java

License:Open Source License

/**
 * Resolves the keys of the nodes referenced by the node with the given id.
 * First locates the source document on the highest applicable layer, then,
 * for each entry in its reference-list field, looks up the referenced
 * document (optionally filtered by prototype and an appended query) and
 * collects its key.
 *
 * @param id     the id of the source node
 * @param mode   the layer mode to search down from
 * @param protos prototype names restricting the referenced nodes (may be empty)
 * @param append additional clauses applied to each reference lookup (may be null)
 * @param sort   unused by the current implementation -- TODO confirm it can be dropped
 * @return the keys of the referenced nodes (never null, possibly empty)
 * @throws Exception if the index cannot be searched
 */
public Key[] getTargetNodeIds(final String id, final int mode, ArrayList protos, BooleanQuery append, Sort sort)
        throws Exception {
    IndexSearcher searcher = null;
    Document doc = null;
    BooleanQuery query = null;

    try {
        searcher = this.getIndexSearcher();
        String idvalue = id;
        Query id_query = new TermQuery(new Term(ID, idvalue));

        // Walk the layers from the requested mode down to LIVE_MODE and take
        // the first layer that yields a hit for this id.
        for (int i = mode; i >= LIVE_MODE; i--) {
            if (i != LIVE_MODE && !isDraftNode(id, mode)) {
                continue;
            }
            // A fresh query is built per layer here -- contrast with the
            // second loop below, which reuses one query object.
            query = new BooleanQuery();
            query.add(id_query, BooleanClause.Occur.MUST);
            query.add(new TermQuery(new Term(LAYER_OF_SAVE, i + "")), BooleanClause.Occur.MUST);
            Hits hits = searcher.search(query);

            /*if (app.debug()) 
               app.logEvent("LuceneManager.getTargetNodeIds(): id=" + id 
                + ",layer=" + mode + " executed query [" + query 
                + "] which resulted in " + hits.length() + " hits");*/

            if (hits.length() > 0) {
                doc = hits.doc(0);
                break;
            }
        }
    } catch (Exception ex) {
        // NOTE(review): the message references "getSourceReferences" but this
        // method is getTargetNodeIds -- looks like a copy/paste slip; confirm.
        app.logError(
                ErrorReporter.errorMsg(this.getClass(), "getSourceReferences") + "Could not retrieve document "
                        + id + " from Lucene index with query = " + (query != null ? query : "null"),
                ex);
        throw ex;
    } finally {
        // NOTE(review): the searcher is released here but is still used by the
        // searches in the loop below -- verify that releaseIndexSearcher
        // permits continued use; otherwise this is a use-after-release.
        this.releaseIndexSearcher(searcher);
    }

    if (doc == null) {
        return new Key[0];
    }

    // Each REF_LIST_FIELD instance encodes one outgoing reference.
    Field[] fields = doc.getFields(REF_LIST_FIELD);
    int len;
    if ((fields == null) || ((len = fields.length) == 0)) {
        return new Key[0];
    }

    ArrayList<Key> keys = new ArrayList<Key>();
    doc = null;
    for (int i = 0; i < len; i++) {
        doc = null;
        query = new BooleanQuery();
        String refid = getIdFromRefListField(fields[i]);
        query.add(new TermQuery(new Term(ID, refid)), BooleanClause.Occur.MUST);
        // Optionally restrict the referenced node to one of the given prototypes.
        BooleanQuery proto_query = null;
        final int sizeOfProtos;
        if ((sizeOfProtos = protos.size()) > 0) {
            proto_query = new BooleanQuery();
            for (int j = 0; j < sizeOfProtos; j++) {
                proto_query.add(new TermQuery(new Term(PROTOTYPE, (String) protos.get(j))),
                        BooleanClause.Occur.SHOULD);
            }
            query.add(proto_query, BooleanClause.Occur.MUST);
        }
        if (append != null && append.getClauses().length > 0) {
            query.add(append, BooleanClause.Occur.MUST);
        }
        for (int j = mode; j >= LIVE_MODE; j--) {
            if (j != LIVE_MODE && !isDraftNode(refid, mode)) {
                continue;
            }
            // NOTE(review): unlike the first loop, this adds a LAYER_OF_SAVE
            // clause to the SAME query object on every iteration without
            // removing the previous one; after the first pass the query
            // requires two different layer values and can no longer match.
            // Confirm whether the query should be rebuilt per layer instead.
            query.add(new TermQuery(new Term(LAYER_OF_SAVE, j + "")), BooleanClause.Occur.MUST);
            Hits hits = searcher.search(query);

            /*if (app.debug()) 
               app.logEvent("LuceneManager.getTargetNodeIds() [for retrieving target " +
                "keys]: id=" + id + ",layer=" + mode + " executed query [" + query 
                + "] which resulted in " + hits.length() + " hits");*/

            if (hits.length() > 0) {
                doc = hits.doc(0);
                break;
            }
        }
        if (doc != null) {
            keys.add(new DbKey(this.app.getDbMapping(doc.get(PROTOTYPE)), doc.get(ID), mode));
        }
    }

    Key[] key_arr = new Key[keys.size()];
    return keys.toArray(key_arr);
}

From source file:axiom.objectmodel.dom.LuceneManager.java

License:Open Source License

/**
 * Checks whether the given id appears as the referenced id (the first
 * NULL_DELIM-separated token) of any REF_LIST_FIELD entry on the document.
 */
public static boolean isIdInDocumentRefs(Document doc, String id) {
    Field[] refFields = doc.getFields(REF_LIST_FIELD);
    if (refFields == null) {
        return false;
    }
    for (Field ref : refFields) {
        String[] tokens = ref.stringValue().split(NULL_DELIM);
        if (id.equals(tokens[0])) {
            return true;
        }
    }
    return false;
}

From source file:axiom.scripting.rhino.LuceneQueryDispatcher.java

License:Open Source License

/**
 * Translates Lucene hits into Reference scripting objects, keeping only
 * references whose target id is contained in the requested target set,
 * and appends them to the results list.
 */
private void luceneResultsToReferences(final Hits hits, final ArrayList results, final HashSet targets,
        final int mode) throws Exception {
    final int hitCount = hits.length();
    final String idField = LuceneManager.ID;
    final String refField = LuceneManager.REF_LIST_FIELD;
    final String delim = LuceneManager.NULL_DELIM;
    final Context cx = Context.getCurrentContext();
    final GlobalObject global = this.core != null ? this.core.global : null;
    // Without the scripting global we cannot construct Reference objects.
    if (global == null) {
        return;
    }

    for (int i = 0; i < hitCount; i++) {
        final Document d = hits.doc(i);
        final Field source = d.getField(idField);
        if (source == null) {
            continue;
        }
        final String sourceId = source.stringValue();

        final Field[] refs = d.getFields(refField);
        final int refCount = refs != null ? refs.length : 0;
        for (int j = 0; j < refCount; j++) {
            // Each ref encodes: targetId, sourceProperty[, sourceIndex[, sourceXPath]].
            final String[] parts = refs[j].stringValue().split(delim);
            if (!targets.contains(parts[0])) {
                continue;
            }
            final Object[] args = { new DbKey(null, parts[0], mode) };
            final Reference relobj = (Reference) cx.newObject(global, "Reference", args);
            relobj.setSourceKey(new DbKey(null, sourceId, mode));
            relobj.setSourceProperty(parts[1]);
            if (parts.length > 2) {
                relobj.setSourceIndex(Integer.parseInt(parts[2]));
                if (parts.length > 3) {
                    relobj.setSourceXPath(parts[3]);
                }
            }

            results.add(relobj);
        }
    }
}

From source file:com.b2international.index.lucene.IndexFieldBase.java

License:Apache License

/** Returns all indexed field instances on {@code doc} matching this index field's name. */
public final IndexableField[] getFields(Document doc) {
    final String name = fieldName();
    return doc.getFields(name);
}

From source file:com.core.nlp.query.MoreLikeThis.java

License:Apache License

/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @throws IOException if there is a low-level I/O error
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    // The term-vector lookup depends only on docNum, so fetch it once
    // instead of once per field (the original re-fetched it in the loop).
    final Fields vectors = ir.getTermVectors(docNum);
    for (String fieldName : fieldNames) {
        final Terms vector = (vectors != null) ? vectors.terms(fieldName) : null;

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector);
        }
    }

    return createQueue(termFreqMap);
}

From source file:com.esri.gpt.catalog.lucene.LuceneIndexAdapter.java

License:Apache License

/**
 * Queries the ACL values indexed for a document.
 * @param uuid the document UUID/*from  ww  w.  j  a va 2 s . c  o  m*/
 * @return the ACL values (can be null)
 * @throws CatalogIndexException if an exception occurs
 */
@Override
public String[] queryAcls(String uuid) throws CatalogIndexException {
    ArrayList<String> values = new ArrayList<String>();
    IndexSearcher searcher = null;
    TermDocs termDocs = null;
    try {
        uuid = Val.chkStr(uuid);
        if (uuid.length() > 0) {
            searcher = newSearcher();
            String[] aFields = new String[] { Storeables.FIELD_ACL };
            MapFieldSelector selector = new MapFieldSelector(aFields);
            searcher = newSearcher();
            IndexReader reader = searcher.getIndexReader();
            termDocs = reader.termDocs();
            termDocs.seek(new Term(Storeables.FIELD_UUID, uuid));
            if (termDocs.next()) {
                Document document = reader.document(termDocs.doc(), selector);
                Field[] fields = document.getFields(Storeables.FIELD_ACL);
                if ((fields != null) && (fields.length > 0)) {
                    for (Field field : fields) {
                        values.add(field.stringValue());
                    }
                }
            }
        }
    } catch (IOException e) {
        String sMsg = "Error accessing index:\n " + Val.chkStr(e.getMessage());
        throw new CatalogIndexException(sMsg, e);
    } finally {
        try {
            if (termDocs != null)
                termDocs.close();
        } catch (Exception ef) {
        }
        closeSearcher(searcher);
    }
    return values.toArray(new String[0]);
}

From source file:com.esri.gpt.catalog.lucene.LuceneIndexSynchronizer.java

License:Apache License

/**
 * Walks the documents within the database modifying the index as required.
 * For each database record: determines whether the record is indexable,
 * queues deletions for index entries of non-indexable records, and
 * re-publishes records whose index entry is missing or out of sync
 * (by modification date, ACL, or collection membership).
 *
 * @param info synchronization step information
 * @throws IOException if an exception occurs while communicating with the index
 * @throws SQLException if an exception occurs while communicating with the database
 * @throws CatalogIndexException if an exception occurs while modifying the index
 */
private void walkDatabase(WalkDatabaseInfo info) throws IOException, SQLException, CatalogIndexException {
    LOGGER.fine("Checking database records...");
    PreparedStatement st = null;
    PreparedStatement stCol = null;
    TermDocs termDocs = null;
    try {

        // determine the metadata acl policy
        MetadataAcl acl = new MetadataAcl(this.context);
        boolean bCheckAcl = !acl.isPolicyUnrestricted();

        // determine if collections are being used
        List<String[]> collections = null;
        CollectionDao colDao = new CollectionDao(this.context);
        boolean hasCollections = false;
        boolean useCollections = colDao.getUseCollections();
        String sColMemberTable = colDao.getCollectionMemberTableName();
        String sqlCol = "SELECT COLUUID FROM " + sColMemberTable + " WHERE DOCUUID=?";
        if (useCollections) {
            collections = colDao.queryCollections();
            hasCollections = (collections.size() > 0);
        }

        // initialize index related variables
        boolean bCheckIndex = (info.numOriginallyIndexed > 0);
        String fldUuid = Storeables.FIELD_UUID;
        String fldModified = Storeables.FIELD_DATEMODIFIED;
        String fldAcl = Storeables.FIELD_ACL;

        // only the fields needed for the sync comparison are loaded
        ArrayList<String> alFields = new ArrayList<String>();
        alFields.add(fldModified);
        if (bCheckAcl)
            alFields.add(fldAcl);
        if (useCollections)
            alFields.add("isPartOf");
        FieldSelector selector = new MapFieldSelector(alFields.toArray(new String[0]));

        Term termUuid = new Term(fldUuid);
        if (bCheckIndex) {
            termDocs = this.reader.termDocs();
        }
        // buffer of uuids pending deletion, flushed in batches
        StringSet delUuids = new StringSet();

        // build the database query
        StringBuffer sb = new StringBuffer("SELECT");
        sb.append(" ").append(this.resourceTable).append(".DOCUUID");
        sb.append(",").append(this.resourceTable).append(".APPROVALSTATUS");
        sb.append(",").append(this.resourceTable).append(".PROTOCOL_TYPE");
        sb.append(",").append(this.resourceTable).append(".FINDABLE");
        sb.append(",").append(this.resourceTable).append(".UPDATEDATE");
        sb.append(",").append(this.resourceTable).append(".ACL");
        sb.append(" FROM ").append(this.resourceTable);
        String sql = sb.toString();
        LOGGER.finest(sql);

        // execute the query, walk through the database records
        Connection con = this.context.getConnectionBroker().returnConnection("").getJdbcConnection();
        st = con.prepareStatement(sql);
        ResultSet rs = st.executeQuery();
        if (this.checkInterrupted())
            return;
        if (useCollections && hasCollections) {
            stCol = con.prepareStatement(sqlCol);
        }

        while (rs.next()) {

            info.numProcessed++;
            info.loopCount++;
            long nDbTimeModified = 0;
            Timestamp tsDbModified = null;
            String sDbAcl = null;
            boolean bIndexable = false;

            // read the database uuid and approval status
            String uuid = rs.getString(1);
            String status = rs.getString(2);
            String protocolType = Val.chkStr(rs.getString(3));
            boolean findable = Val.chkBool(rs.getString(4), false);

            // only approved/reviewed records are indexable; harvested
            // (protocol-typed) records must additionally be findable
            bIndexable = (status != null)
                    && (status.equalsIgnoreCase("approved") || status.equalsIgnoreCase("reviewed"));
            if (bIndexable && protocolType.length() > 0 && !findable) {
                bIndexable = false;
            }

            // read the database modification date
            if (bIndexable) {
                tsDbModified = rs.getTimestamp(5);
                if (tsDbModified != null) {
                    nDbTimeModified = tsDbModified.getTime();
                }
                bIndexable = (nDbTimeModified > 0);
            }

            // for non-indexable documents, delete
            if (!bIndexable) {
                info.numNonIndexable++;
                if (bCheckIndex) {
                    termDocs.seek(termUuid.createTerm(uuid));
                    if (termDocs.next()) {
                        info.numNonIndexableFound++;
                        info.numRequiringDelete++;
                        delUuids.add(uuid);
                        // flush the delete buffer once it reaches the batch size
                        if (delUuids.size() >= this.maxDeleteTokens) {
                            if (this.checkInterrupted())
                                return;
                            this.deleteDocuments(delUuids);
                            info.numDocsDeleted += delUuids.size();
                            delUuids.clear();
                            if (this.checkInterrupted())
                                return;
                        }
                    }
                }
            }

            // for indexable documents, check to ensure that they are in sync
            if (bIndexable) {
                info.numIndexable++;
                boolean bRequiresUpdate = true;

                // find the document within the index
                if (bCheckIndex) {
                    termDocs.seek(termUuid.createTerm(uuid));
                    if (termDocs.next()) {
                        info.numIndexableFound++;
                        Document doc = this.reader.document(termDocs.doc(), selector);
                        if (doc != null) {
                            bRequiresUpdate = false;

                            // check the modification date
                            long nIdxTimeModified = 0;
                            String sModified = doc.get(fldModified);
                            if (sModified != null) {
                                try {
                                    // parseLong avoids the boxing of Long.valueOf;
                                    // same NumberFormatException behavior on bad input
                                    nIdxTimeModified = Long.parseLong(sModified);
                                } catch (NumberFormatException e) {
                                    nIdxTimeModified = 0;
                                }
                            }
                            bRequiresUpdate = (nIdxTimeModified == 0) || (nDbTimeModified > nIdxTimeModified);
                            if (bRequiresUpdate)
                                info.numWithInconsistentDates++;

                            // check the acl: compare the database ACL set with the
                            // indexed ACL set (order-insensitive, case-insensitive)
                            if (!bRequiresUpdate && bCheckAcl) {
                                long tAclStartMillis = System.currentTimeMillis();
                                bRequiresUpdate = true;
                                String[] aclsDb = null;
                                sDbAcl = rs.getString(6);
                                try {
                                    // use an internal method for quick parsing
                                    //aclsDb = acl.makeDocumentAcl(sDbAcl);
                                    aclsDb = this.parseAcl(sDbAcl);
                                } catch (Exception eacl) {
                                    String sMsg = "Error parsing acl";
                                    sMsg += ", uuid=" + uuid + "\n" + Val.chkStr(eacl.getMessage());
                                    LOGGER.log(Level.WARNING, sMsg, eacl);
                                }

                                if (aclsDb == null)
                                    aclsDb = new String[0];
                                ArrayList<String> aclsIdx = new ArrayList<String>();
                                Field[] aclFields = doc.getFields(fldAcl);
                                if ((aclFields != null) && (aclFields.length > 0)) {
                                    for (Field aclField : aclFields) {
                                        aclsIdx.add(aclField.stringValue());
                                    }
                                }
                                if (aclsDb.length == aclsIdx.size()) {
                                    int nMatched = 0;
                                    if (aclsDb.length > 0) {
                                        for (String s1 : aclsDb) {
                                            for (String s2 : aclsIdx) {
                                                if (s1.equalsIgnoreCase(s2)) {
                                                    nMatched++;
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    bRequiresUpdate = (nMatched != aclsDb.length);
                                }
                                if (bRequiresUpdate)
                                    info.numWithInconsistentAcls++;
                                info.aclMillis += (System.currentTimeMillis() - tAclStartMillis);
                            }

                            // check collection membership: compare the collections the
                            // record belongs to in the database with the indexed "isPartOf"
                            if (!bRequiresUpdate && useCollections) {
                                long tColStartMillis = System.currentTimeMillis();
                                bRequiresUpdate = true;

                                ArrayList<String> colDb = new ArrayList<String>();
                                if (useCollections && hasCollections) {
                                    stCol.clearParameters();
                                    stCol.setString(1, uuid);
                                    ResultSet rsCol = stCol.executeQuery();
                                    while (rsCol.next()) {
                                        String sCUuid = rsCol.getString(1);
                                        // map the collection uuid to its name
                                        for (String[] col : collections) {
                                            if (sCUuid.equals(col[0])) {
                                                colDb.add(col[1]);
                                                break;
                                            }
                                        }
                                    }
                                    rsCol.close();
                                }

                                ArrayList<String> colIdx = new ArrayList<String>();
                                Field[] colFields = doc.getFields("isPartOf");
                                if ((colFields != null) && (colFields.length > 0)) {
                                    for (Field colField : colFields) {
                                        colIdx.add(colField.stringValue());
                                    }
                                }
                                if (colDb.size() == colIdx.size()) {
                                    int nMatched = 0;
                                    if (colDb.size() > 0) {
                                        for (String s1 : colDb) {
                                            for (String s2 : colIdx) {
                                                if (s1.equalsIgnoreCase(s2)) {
                                                    nMatched++;
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    bRequiresUpdate = (nMatched != colDb.size());
                                }
                                if (bRequiresUpdate)
                                    info.numWithInconsistentColMembership++;
                                info.colMillis += (System.currentTimeMillis() - tColStartMillis);
                            }

                        }
                    }
                }

                // execute the update if required
                if (bRequiresUpdate) {
                    if (this.checkInterrupted())
                        return;
                    try {
                        if (bCheckAcl) {
                            if (sDbAcl == null)
                                sDbAcl = rs.getString(6);
                        }
                        String sXml = Val.chkStr(this.readXml(uuid));
                        if (sXml.length() > 0) {
                            info.numRequiringUpdate++;
                            MetadataDocument mdDoc = new MetadataDocument();
                            Schema schema = mdDoc.prepareForView(this.context, sXml);
                            this.adapter.publishDocument(uuid, tsDbModified, schema, sDbAcl);
                            info.numDocsUpdated++;
                        }
                    } catch (SchemaException se) {

                        // dont' allow the entire process to fail over one bad xml
                        String sMsg = "Error indexing document during synchronization";
                        sMsg += ", uuid=" + uuid + "\n" + Val.chkStr(se.getMessage());
                        LOGGER.log(Level.WARNING, sMsg, se);
                    }
                    if (this.checkInterrupted())
                        return;
                }

            }

            // cache the synchronized uuids
            if (this.synchedUuidCache != null) {
                this.synchedUuidCache.put(uuid, "");
                if (this.synchedUuidCache.size() > this.maxUuidCache) {
                    this.synchedUuidCache = null;
                }
            }

            // log a status message if the feedback threshold was reached
            if (this.checkInterrupted())
                return;
            if ((System.currentTimeMillis() - info.loopStartMillis) >= this.feedbackMillis) {
                LOGGER.info(info.getLoopMessage());
            }

        }

        // delete any documents left over in the buffer
        // (the original guard was "delUuids.size() >= 0", which is always
        // true; the vacuous check has been removed -- the flush remains
        // unconditional, exactly as it effectively was before)
        if (this.checkInterrupted())
            return;
        this.deleteDocuments(delUuids);
        info.numDocsDeleted += delUuids.size();

        LOGGER.info(info.getStepMessage());
    } finally {
        try {
            if (st != null)
                st.close();
        } catch (Exception ef) {
            // best-effort close; intentionally ignored
        }
        try {
            if (stCol != null)
                stCol.close();
        } catch (Exception ef) {
            // best-effort close; intentionally ignored
        }
        try {
            if (termDocs != null)
                termDocs.close();
        } catch (Exception ef) {
            // best-effort close; intentionally ignored
        }
    }
}