Sie könnten einen benutzerdefinierten Analyzer verwenden, um Ihren XML-Stream zu analysieren. Ich habe einen zusammengebaut, der bei Leerzeichen, '>' und '/' trennt, sodass XML-Tags als Tokens erhalten bleiben, die mit '<' beginnen.
/**
 * Demonstrates Lucene span queries over XML-ish content.
 *
 * <p>Indexing uses a custom tokenizer that splits on whitespace, '/' and '>',
 * so an opening tag survives as a single token starting with '<' (e.g. "<x")
 * and can be targeted by a {@link SpanTermQuery}.
 */
public class SpanQueryTests {
    private IndexSearcher searcher;
    private IndexReader reader;
    private Analyzer analyzer;

    /**
     * Tokenizer that treats whitespace, '/' and '>' as delimiters; every
     * other character (including '<') is part of a token.
     */
    static class XMLTokenizer extends CharTokenizer {
        // Delimiter characters in addition to whitespace.
        // Typed Set<Character>; the original raw Set relied on unchecked autoboxing.
        final static Set<Character> chars = ImmutableSet.of('/', '>');

        public XMLTokenizer(Reader input) {
            super(input);
        }

        @Override
        protected boolean isTokenChar(char c) {
            return !(Character.isWhitespace(c) || chars.contains(c));
        }
    }

    /**
     * Builds an in-memory index of two small XML-like documents and opens a
     * searcher over it.
     *
     * @throws Exception if index creation fails
     */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                return new XMLTokenizer(reader);
            }

            @Override
            public TokenStream reusableTokenStream(String fieldName, Reader reader)
                    throws IOException {
                // Reuse the per-thread tokenizer when one exists, to avoid
                // re-allocating it for every field (pre-4.0 Analyzer idiom).
                Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
                if (tokenizer == null) {
                    tokenizer = new XMLTokenizer(reader);
                    setPreviousTokenStream(tokenizer);
                } else {
                    tokenizer.reset(reader);
                }
                return tokenizer;
            }
        };
        IndexWriter writer = new IndexWriter(dir, analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
        // Typed list so the for-each below iterates Strings; the original raw
        // ImmutableList would not compile against "for (String content : docs)".
        ImmutableList<String> docs = ImmutableList.of(
                "<doc>text sample text <x>test</x> words lipsum words words " +
                        "<x>text</x> some other text </doc>",
                "<foobar>test</foobar> some more text flop");
        int id = 0;
        for (String content : docs) {
            Document doc = new Document();
            // BUG FIX: the original incremented 'id' both inline here (id++)
            // and again at the end of the loop, yielding ids 0, 2, ... instead
            // of consecutive ids 0, 1, ...
            doc.add(new Field("id", String.valueOf(id), Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
            doc.add(new Field("content", content, Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(doc);
            id++;
        }
        writer.close();
        searcher = new IndexSearcher(dir);
        reader = searcher.getIndexReader();
    }

    /** Releases the searcher (which owns the underlying reader). */
    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    /**
     * Dumps the spans of several term queries and of two near-queries that
     * combine a word with an XML tag token ("<x").
     *
     * <p>NOTE(review): dumpSpans is not defined in this file — presumably the
     * "Lucene in Action" helper that prints each matching span; confirm it is
     * on the classpath.
     *
     * @throws Exception if searching fails
     */
    @Test
    public void testTermNearQuery() throws Exception {
        SpanTermQuery tq1 = new SpanTermQuery(new Term("content", "lipsum"));
        dumpSpans(tq1);
        SpanTermQuery tq2 = new SpanTermQuery(new Term("content", "other"));
        dumpSpans(tq2);
        // The tag itself is a token because '<' is a token character.
        SpanTermQuery tq3 = new SpanTermQuery(new Term("content", "<x"));
        dumpSpans(tq3);
        SpanNearQuery snq1 = new SpanNearQuery(new SpanQuery[] { tq1, tq3 }, 2, false);
        dumpSpans(snq1);
        SpanNearQuery snq2 = new SpanNearQuery(new SpanQuery[] { tq2, tq3 }, 2, false);
        dumpSpans(snq2);
    }
}
Die Ergebnisse sind:
query content:lipsum
<doc>text sample text <x>test</x> words <lipsum> words words <x>text</x> some other text </doc> (0.15467961)
query content:other
<doc>text sample text <x>test</x> words lipsum words words <x>text</x> some <other> text </doc> (0.15467961)
query content:<x
<doc>text sample text <<x> test</x> words lipsum words words <x>text</x> some other text </doc> (0.21875)
<doc>text sample text <x> test</x> words lipsum words words <<x> text</x> some other text </doc> (0.21875)
query spanNear([content:lipsum, content:<x], 2, false)
<doc>text sample text <x> test</x> words <lipsum> words words <x> text</x> some other text </doc> (0.19565594)
query spanNear([content:other, content:<x], 2, false)
KEINE Treffer