Java tutorial
// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.semantic.broadcast; import static org.junit.Assert.assertEquals; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.*; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import org.talend.dataquality.semantic.api.CategoryRegistryManager; import org.talend.dataquality.semantic.api.DictionaryUtils; import org.talend.dataquality.semantic.index.ClassPathDirectory; import org.talend.dataquality.semantic.index.DictionarySearcher; public class BroadcastIndexObjectTest { private static final Map<String, String[]> TEST_INDEX_CONTENT = new LinkedHashMap<String, String[]>() { private static final long serialVersionUID = 1L; { put("AIRPORT", new String[] { "CDG" }); put("COMPANY", new String[] { "Talend SA" }); put("STREET_TYPE", new String[] { "BOULEVARD", "BD" }); } }; @Test public void testCreateFromObjects() throws Exception { // given final BroadcastDocumentObject object = new BroadcastDocumentObject("CATEGORY", Collections.singleton("Value")); final BroadcastIndexObject bio = new BroadcastIndexObject(Collections.singletonList(object)); try (Directory directory = bio.get()) { // when // then DirectoryReader directoryReader = DirectoryReader.open(directory); Document ramDoc = directoryReader.document(0); String word = ramDoc.getField(DictionarySearcher.F_CATID).stringValue(); assertEquals(1, directoryReader.numDocs()); assertEquals("CATEGORY", word); } } @Test public void testCreateAndGet() throws URISyntaxException, IOException { // init a local index final File testFolder = new File("target/broadcast"); if (testFolder.exists()) { FileUtils.deleteDirectory(testFolder); } try { FSDirectory testDir = FSDirectory.open(testFolder); IndexWriter writer = new IndexWriter(testDir, new IndexWriterConfig(Version.LATEST, new StandardAnalyzer(CharArraySet.EMPTY_SET))); if (writer.maxDoc() > 0) { writer.deleteAll(); writer.commit(); } for (String key : TEST_INDEX_CONTENT.keySet()) { Document doc = DictionaryUtils.generateDocument("TEST", key, key, new HashSet<>(Arrays.asList(TEST_INDEX_CONTENT.get(key)))); writer.addDocument(doc); } // here we add an extra document and remove it later. Document doc = DictionaryUtils.generateDocument("TEST", "DE_LAND", "DE_LAND", new HashSet<>(Arrays.asList(new String[] { "Bayern" }))); writer.addDocument(doc); writer.commit(); // when a document is deleted from lucene index, it's marked as deleted, but not physically deleted. // we need to assure that it's not propagated to Spark cluster writer.deleteDocuments(new Term(DictionarySearcher.F_CATID, "DE_LAND")); writer.commit(); writer.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } // create the broadcast object from local index final Directory cpDir = ClassPathDirectory.open(testFolder.toURI()); final BroadcastIndexObject bio = new BroadcastIndexObject(cpDir, true); // get the RamDirectory from BroadcastIndexObject final Directory ramDir = bio.get(); // assertions try { DirectoryReader cpDirReader = DirectoryReader.open(cpDir); assertEquals("Unexpected document count in created index. ", TEST_INDEX_CONTENT.size(), cpDirReader.numDocs()); DirectoryReader ramDirReader = DirectoryReader.open(ramDir); assertEquals("Unexpected document count in created index. ", TEST_INDEX_CONTENT.size(), ramDirReader.numDocs()); for (int i = 0; i < TEST_INDEX_CONTENT.size(); i++) { Document doc = cpDirReader.document(i); String cpWord = doc.getField(DictionarySearcher.F_CATID).stringValue(); Document ramDoc = ramDirReader.document(i); String ramWord = ramDoc.getField(DictionarySearcher.F_CATID).stringValue(); assertEquals("Unexpected word", cpWord, ramWord); } } catch (IOException e) { e.printStackTrace(); } } @Test public void testCreateWithOpenCategories() throws URISyntaxException { URI uri = CategoryRegistryManager.getInstance().getDictionaryURI(); final Directory cpDir = ClassPathDirectory.open(uri); final BroadcastIndexObject bio = new BroadcastIndexObject(cpDir, true); assertEquals("Unexpected Document Size!", 250078, bio.getDocumentList().size()); } @Test public void testCreateWithoutOpenCategories() throws URISyntaxException { URI uri = CategoryRegistryManager.getInstance().getDictionaryURI(); final Directory cpDir = ClassPathDirectory.open(uri); final BroadcastIndexObject bio = new BroadcastIndexObject(cpDir, false); assertEquals("Unexpected Document Size!", 45698, bio.getDocumentList().size()); } }