Révision 3281
TXM/main/org.txm.groovy.core/src/groovy/org/txm/scripts/filters/Tokeniser/SimpleTokenizerXml.groovy (revision 3281) | ||
---|---|---|
523 | 523 |
writer.writeCharacters("\n"); |
524 | 524 |
} |
525 | 525 |
if (stringTokenizer.doSentences()) { |
526 |
writer.writeProcessingInstruction("txm", "</s>") |
|
526 |
writer.writeProcessingInstruction("txm", "</s>\n")
|
|
527 | 527 |
} |
528 | 528 |
} |
529 | 529 |
} |
TXM/main/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3281) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.util.List; |
4 | 4 |
|
5 |
/** |
|
6 |
* Interface of a String tokenizer
|
7 |
* |
|
8 |
* @author mdecorde |
|
9 |
* |
|
10 |
*/ |
|
5 | 11 |
public interface StringTokenizer { |
6 | 12 |
|
13 |
/** |
|
14 |
* |
|
15 |
* @param text the text to tokenize
|
16 |
* @return the tokenized string: a list of sentences, each containing a list of tokens
|
17 |
*/ |
|
7 | 18 |
List<List<String>> processText(String text); |
8 | 19 |
|
20 |
/** |
|
21 |
* |
|
22 |
* @return true if the String tokenizer detects sentences |
|
23 |
*/ |
|
9 | 24 |
boolean doSentences(); |
10 | 25 |
} |
Formats disponibles : Unified diff