de.ids_mannheim.korap.search.TestMetaFields.java Source code

Java tutorial

Introduction

Here is the source code for de.ids_mannheim.korap.search.TestMetaFields.java

Source

package de.ids_mannheim.korap.search;

import java.util.*;
import java.io.*;

import static de.ids_mannheim.korap.TestSimple.*;

import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.KrillQuery;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.response.Result;
import java.nio.file.Files;
import java.nio.file.FileSystem;
import java.nio.file.Path;
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;

import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

@RunWith(JUnit4.class)
public class TestMetaFields {

    /**
     * Runs two queries with different field selections against a two-document
     * wiki index and inspects the metadata attached to the first match.
     *
     * <p>For {@code fields.jsonld} the response must mirror {@code UID} and
     * {@code corpusID} as its first two fields (in either order) and must not
     * carry any of the other checked metadata fields on the match. For
     * {@code fields_2.jsonld} the match must carry author, title, docID and
     * matchID, while {@code corpusID} and {@code textSigle} must be absent.
     *
     * @throws IOException
     *             if raised while indexing the fixtures, loading a query
     *             file or parsing the JSON response
     */
    @Test
    public void searchMetaFields() throws IOException {

        // Index the two wiki fixtures
        final KrillIndex index = new KrillIndex();
        final String[] docNumbers = { "00001", "00002" };
        for (final String docNumber : docNumbers) {
            final String fixture = "/wiki/" + docNumber + ".json.gz";
            index.addDoc(getClass().getResourceAsStream(fixture), true);
        }
        index.commit();

        // One mapper serves both responses
        final ObjectMapper json = new ObjectMapper();

        // --- First query: fields.jsonld ---
        String queryFile = getClass().getResource("/queries/metas/fields.jsonld").getFile();
        Result result = new Krill(getJsonString(queryFile)).apply(index);

        assertEquals(17L, result.getTotalResults());
        assertEquals(0, result.getStartIndex());
        assertEquals(9, result.getItemsPerPage());

        JsonNode tree = json.readTree(result.toJsonString());
        assertEquals(9, tree.at("/meta/count").asInt());

        // The requested fields are mirrored in the response; their order is not fixed
        final String firstField = tree.at("/meta/fields/0").asText();
        if ("UID".equals(firstField)) {
            assertEquals("corpusID", tree.at("/meta/fields/1").asText());
        } else {
            assertEquals("corpusID", tree.at("/meta/fields/0").asText());
            assertEquals("UID", tree.at("/meta/fields/1").asText());
        }

        // Only UID and corpusID may be present on the match
        assertEquals(0, tree.at("/matches/0/UID").asInt());
        assertEquals("WPD", tree.at("/matches/0/corpusID").asText());
        assertTrue(tree.at("/matches/0/docID").isMissingNode());
        assertTrue(tree.at("/matches/0/textSigle").isMissingNode());
        assertTrue(tree.at("/matches/0/ID").isMissingNode());
        assertTrue(tree.at("/matches/0/author").isMissingNode());
        assertTrue(tree.at("/matches/0/title").isMissingNode());
        assertTrue(tree.at("/matches/0/subTitle").isMissingNode());
        assertTrue(tree.at("/matches/0/textClass").isMissingNode());
        assertTrue(tree.at("/matches/0/pubPlace").isMissingNode());
        assertTrue(tree.at("/matches/0/pubDate").isMissingNode());
        assertTrue(tree.at("/matches/0/foundries").isMissingNode());
        assertTrue(tree.at("/matches/0/layerInfos").isMissingNode());
        assertTrue(tree.at("/matches/0/tokenization").isMissingNode());

        // --- Second query: fields_2.jsonld ---
        queryFile = getClass().getResource("/queries/metas/fields_2.jsonld").getFile();
        result = new Krill(getJsonString(queryFile)).apply(index);

        assertEquals(17L, result.getTotalResults());
        assertEquals(0, result.getStartIndex());
        assertEquals(2, result.getItemsPerPage());

        tree = json.readTree(result.toJsonString());
        assertEquals(0, tree.at("/matches/0/UID").asInt());
        assertTrue(tree.at("/matches/0/corpusID").isMissingNode());
        assertEquals("Ruru,Jens.Ol,Aglarech", tree.at("/matches/0/author").asText());
        assertEquals("A", tree.at("/matches/0/title").asText());
        assertEquals("WPD_AAA.00001", tree.at("/matches/0/docID").asText());
        assertTrue(tree.at("/matches/0/textSigle").isMissingNode());

        // The matchID is expected in its long form (with "match-" and the
        // document ID in front), not as the bare position "p6-7"
        assertEquals("match-WPD_AAA.00001-p6-7", tree.at("/matches/0/matchID").asText());

        // These paths must read as the empty string
        assertEquals("", tree.at("/matches/0/subTitle").asText());
        assertEquals("", tree.at("/matches/0/textClass").asText());
        assertEquals("", tree.at("/matches/0/pubPlace").asText());
        assertEquals("", tree.at("/matches/0/pubDate").asText());
        assertEquals("", tree.at("/matches/0/foundries").asText());
        // NOTE(review): this path reads "layerInfo" while the first query
        // checks "layerInfos" - confirm which name is intended
        assertEquals("", tree.at("/matches/0/layerInfo").asText());
        assertEquals("", tree.at("/matches/0/tokenization").asText());
    }

    /**
     * Checks the metadata returned for the Goethe fixture
     * {@code /goe/AGX-00002.json} under three field selections:
     * {@code fields_no.jsonld}, {@code fields_all.jsonld} and
     * {@code fields_at_all.jsonld}.
     *
     * <p>Expected values that contain German umlauts are written with Unicode
     * escapes, so the assertions do not depend on the encoding this source
     * file is stored or compiled with.
     *
     * @throws IOException
     *             if raised while indexing the fixture, loading a query file
     *             or parsing the JSON response
     */
    @Test
    public void searchMetaFieldsNew() throws IOException {

        // Expected values shared by the "all fields" and "@all fields" queries
        final String expectedReference =
                "Goethe, Johann Wolfgang von: Maximen und Reflexionen. "
                + "Religion und Christentum, [Aphorismen], "
                + "(Erstver\u00f6ffentlichung: Stuttgart ; T\u00fcbingen, 1827-1842), "
                + "In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, "
                + "Schriften zur Kunst. Schriften zur Literatur. "
                + "Maximen und Reflexionen, Hrsg.: Trunz, Erich. "
                + "M\u00fcnchen: Verlag C. H. Beck, 1982, S. 372-377";
        final String expectedFoundries =
                "base base/paragraphs base/sentences "
                + "connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax "
                + "corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences "
                + "glemm glemm/morpho "
                + "mate mate/morpho "
                + "opennlp opennlp/morpho opennlp/sentences "
                + "treetagger treetagger/morpho treetagger/sentences "
                + "xip xip/constituency xip/morpho xip/sentences";

        // Construct index
        KrillIndex ki = new KrillIndex();
        ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false);
        ki.commit();

        // A single mapper is reused for all three responses
        ObjectMapper mapper = new ObjectMapper();

        // Query 1: fields_no.jsonld
        String jsonString = getJsonString(getClass().getResource("/queries/metas/fields_no.jsonld").getFile());
        Krill ks = new Krill(jsonString);
        Result kr = ks.apply(ki);
        JsonNode res = mapper.readTree(kr.toJsonString());

        assertEquals(0, res.at("/matches/0/UID").asInt());
        assertEquals("GOE_AGX.00002", res.at("/matches/0/textSigle").asText());
        assertEquals("Maximen und Reflexionen", res.at("/matches/0/title").asText());
        assertEquals("1982", res.at("/matches/0/pubDate").asText());
        assertEquals("Goethe, Johann Wolfgang von", res.at("/matches/0/author").asText());
        assertEquals("GOE_AGX", res.at("/matches/0/docSigle").asText());
        assertEquals("GOE", res.at("/matches/0/corpusSigle").asText());
        assertEquals("Religion und Christentum", res.at("/matches/0/subTitle").asText());
        assertEquals("M\u00fcnchen", res.at("/matches/0/pubPlace").asText());
        assertEquals(
                "base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans",
                res.at("/matches/0/layerInfos").asText());
        // textType is not part of this response, unlike in the two queries below
        assertTrue(res.at("/matches/0/textType").isMissingNode());
        assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID").asText());

        // Query 2: all fields
        jsonString = getJsonString(getClass().getResource("/queries/metas/fields_all.jsonld").getFile());
        ks = new Krill(jsonString);
        kr = ks.apply(ki);
        res = mapper.readTree(kr.toJsonString());

        assertEquals("Verlag C. H. Beck", res.at("/matches/0/publisher").asText());
        assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
        assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
        assertEquals(expectedReference, res.at("/matches/0/reference").asText());
        assertEquals("de", res.at("/matches/0/language").asText());
        assertEquals("opennlp#tokens", res.at("/matches/0/tokenSource").asText());
        assertEquals(expectedFoundries, res.at("/matches/0/foundries").asText());
        assertEquals("Goethe-Korpus", res.at("/matches/0/corpusTitle").asText());
        assertEquals("QAO-NC", res.at("/matches/0/availability").asText());
        assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)", res.at("/matches/0/docTitle").asText());
        assertEquals("1827", res.at("/matches/0/creationDate").asText());
        // A check that "/matches/0/pages" equals "372-377" was disabled in this
        // test; reason unknown - TODO confirm before re-enabling
        assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID").asText());

        // Query 3: @all fields
        jsonString = getJsonString(getClass().getResource("/queries/metas/fields_at_all.jsonld").getFile());
        ks = new Krill(jsonString);
        kr = ks.apply(ki);
        res = mapper.readTree(kr.toJsonString());

        assertEquals("Verlag C. H. Beck", res.at("/matches/0/publisher").asText());
        assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
        assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
        assertEquals(expectedReference, res.at("/matches/0/reference").asText());
        assertEquals("de", res.at("/matches/0/language").asText());
        assertEquals("opennlp#tokens", res.at("/matches/0/tokenSource").asText());
        assertEquals(expectedFoundries, res.at("/matches/0/foundries").asText());
        assertEquals("Goethe-Korpus", res.at("/matches/0/corpusTitle").asText());
        // This response is checked under "license", whereas the previous
        // query is checked under "availability"
        assertEquals("QAO-NC", res.at("/matches/0/license").asText());
        assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)", res.at("/matches/0/docTitle").asText());
        assertEquals("1827", res.at("/matches/0/creationDate").asText());
        // The "/matches/0/pages" check is disabled here as well - see above
        assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID").asText());
    }

    /**
     * Indexes two hand-built documents and checks how many of them each of
     * four collection queries selects, then runs the last query as a search
     * and checks the reported total.
     *
     * @throws IOException
     *             if raised while loading a query file or parsing the JSON
     *             response
     */
    @Test
    public void searchCollectionFields() throws IOException {
        // Both documents share the same three-token annotation
        final String tokenAnnotation = "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" + "[(1-2)s:b|i:b|_1#1-2]"
                + "[(2-3)s:c|i:c|_2#2-3]";

        final KrillIndex index = new KrillIndex();

        final FieldDocument goetheDoc = new FieldDocument();
        goetheDoc.addString("corpusSigle", "ABC");
        goetheDoc.addString("docSigle", "ABC-123");
        goetheDoc.addString("textSigle", "ABC-123-0001");
        goetheDoc.addText("title", "Die Wahlverwandschaften");
        goetheDoc.addText("author", "Johann Wolfgang von Goethe");
        goetheDoc.addKeyword("textClass", "reisen wissenschaft");
        goetheDoc.addInt("pubDate", 20130617);
        goetheDoc.addTV("tokens", "abc", tokenAnnotation);
        index.addDoc(goetheDoc);

        final FieldDocument schillerDoc = new FieldDocument();
        schillerDoc.addString("corpusSigle", "ABC");
        schillerDoc.addString("docSigle", "ABC-125");
        schillerDoc.addString("textSigle", "ABC-125-0001");
        schillerDoc.addText("title", "Die Glocke");
        schillerDoc.addText("author", "Schiller, Friedrich");
        schillerDoc.addKeyword("textClass", "Reisen geschichte");
        schillerDoc.addInt("pubDate", 20130203);
        schillerDoc.addTV("tokens", "abc", tokenAnnotation);
        index.addDoc(schillerDoc);

        index.commit();

        // textClass = reisen & wissenschaft: only the first document
        Krill krill = new Krill(getJsonString(
                getClass().getResource("/queries/collections/collection_textClass.jsonld").getFile()));
        KrillCollection collection = krill.getCollection();
        collection.setIndex(index);
        assertEquals(1, collection.numberOf("documents"));

        // textClass = reisen: both documents
        krill = new Krill(getJsonString(
                getClass().getResource("/queries/collections/collection_textClass_2.jsonld").getFile()));
        collection = krill.getCollection();
        collection.setIndex(index);
        assertEquals(2, collection.numberOf("documents"));

        /*
         * Disabled debugging aid: prints the terms produced for the author
         * field of the second document.
         *
         * TokenStream ts = fd2.doc.getField("author").tokenStream(
         *     (Analyzer) ki.writer().getAnalyzer(),
         *     (TokenStream) null
         * );
         * // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
         * CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
         *
         * ts.reset();
         * while (ts.incrementToken()) {
         *     String term = charTermAttribute.toString();
         *     System.err.println(">>" + term + "<<");
         * };
         */

        // author = wolfgang
        krill = new Krill(getJsonString(
                getClass().getResource("/queries/collections/collection_goethe.jsonld").getFile()));
        collection = krill.getCollection();
        collection.setIndex(index);
        assertEquals(1, collection.numberOf("documents"));

        // author = Wolfgang
        krill = new Krill(getJsonString(
                getClass().getResource("/queries/collections/collection_goethe_2.jsonld").getFile()));
        collection = krill.getCollection();
        collection.setIndex(index);
        assertEquals(1, collection.numberOf("documents"));

        // Run the last query as a search and check the reported total
        final Result result = krill.apply(index);
        final JsonNode tree = new ObjectMapper().readTree(result.toJsonString());
        assertEquals(1, tree.at("/meta/totalResults").asInt());
    }

    /**
     * Checks that the query in {@code context_paragraph.jsonld} yields a
     * span-defined context whose span is {@code base/p}. No index is involved;
     * only the parsed query's meta section is inspected.
     *
     * @throws IOException
     *             if raised while loading the query file
     */
    @Test
    public void searchMetaContext() throws IOException {
        final String queryFile = getClass().getResource("/queries/metas/context_paragraph.jsonld").getFile();
        final Krill krill = new Krill(getJsonString(queryFile));

        // The context must be defined by a span, namely base/p
        assertTrue(krill.getMeta().getContext().isSpanDefined());
        assertEquals("base/p", krill.getMeta().getContext().getSpanContext());
    }
};