Révision 4025
| TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TSCorpus.java (revision 4025) | ||
|---|---|---|
| 219 | 219 |
* @param tigerPosition |
| 220 | 220 |
* @return 1 if the TIGER position has a position in CQP ; 0 if not |
| 221 | 221 |
*/ |
| 222 |
public int getPresence(int tigerPosition) {
|
|
| 222 |
public byte getPresence(int tigerPosition) {
|
|
| 223 | 223 |
if (presencesMapped != null) {
|
| 224 |
return presencesMapped.getInt(tigerPosition * Integer.BYTES);
|
|
| 224 |
return presencesMapped.get(tigerPosition);
|
|
| 225 | 225 |
} |
| 226 | 226 |
else {
|
| 227 | 227 |
return 0; |
| ... | ... | |
| 233 | 233 |
* @param tigerPositions |
| 234 | 234 |
* @return 1 if the TIGER position has a position in CQP ; 0 if not |
| 235 | 235 |
*/ |
| 236 |
public int[] getPresences(int tigerPositions[]) {
|
|
| 237 |
int[] ret = new int[tigerPositions.length];
|
|
| 236 |
public byte[] getPresences(int tigerPositions[]) {
|
|
| 237 |
byte[] ret = new byte[tigerPositions.length];
|
|
| 238 | 238 |
if (presencesMapped != null) {
|
| 239 | 239 |
for (int i = 0; i < tigerPositions.length; i++) {
|
| 240 |
ret[i] = presencesMapped.getInt(tigerPositions[i] * Integer.BYTES);
|
|
| 240 |
ret[i] = presencesMapped.get(tigerPositions[i] * Integer.BYTES); |
|
| 241 | 241 |
} |
| 242 | 242 |
} |
| 243 | 243 |
|
| TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 4025) | ||
|---|---|---|
| 42 | 42 |
import org.txm.utils.io.IOUtils; |
| 43 | 43 |
import org.txm.utils.logger.Log; |
| 44 | 44 |
|
| 45 |
import cern.colt.Arrays; |
|
| 45 | 46 |
import ims.tiger.corpus.Feature; |
| 46 | 47 |
import ims.tiger.corpus.Header; |
| 47 | 48 |
import ims.tiger.corpus.Sentence; |
| ... | ... | |
| 258 | 259 |
int iPivot = variables.indexOf("pivot"); //$NON-NLS-1$
|
| 259 | 260 |
|
| 260 | 261 |
MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
| 262 |
MappedByteBuffer presenceMapped = tcorpus.getPresencesMapped(); |
|
| 261 | 263 |
// MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
| 264 |
|
|
| 265 |
ArrayList<String> warnings = new ArrayList<String>(); |
|
| 262 | 266 |
|
| 263 | 267 |
boolean useSubMatches = TigerSearchTreePreferences.getInstance().getBoolean(TigerSearchTreePreferences.USESUBMATCHES); |
| 264 | 268 |
|
| ... | ... | |
| 282 | 286 |
if (iPivot != -1 && i != iPivot) continue; // skip match that are not 'pivot' |
| 283 | 287 |
|
| 284 | 288 |
int left = sent_start + index.getLeftCorner(sent, match[i]); |
| 285 |
if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus
|
|
| 286 |
left += offsetsMapped.getInt(left * Integer.BYTES); |
|
| 287 |
// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES));
|
|
| 288 |
} |
|
| 289 | 289 |
int right = sent_start + index.getRightCorner(sent, match[i]); |
| 290 |
if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus
|
|
| 291 |
right += offsetsMapped.getInt(right * Integer.BYTES); |
|
| 292 |
} |
|
| 293 |
// System.out.println(" M="+match[i]+" ("+left+", "+right+")");
|
|
| 290 |
|
|
| 291 |
// test if the match position is also in the CQP positions |
|
| 292 |
if (presenceMapped.get(left) > 0 && presenceMapped.get(right) > 0) {
|
|
| 294 | 293 |
|
| 295 |
TIGERMatch tigerMatch = new TIGERMatch(left, right); |
|
| 294 |
if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus
|
|
| 295 |
if (presenceMapped.get(left) > 0) {
|
|
| 296 |
left += offsetsMapped.getInt(left * Integer.BYTES); |
|
| 297 |
} |
|
| 298 |
// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES));
|
|
| 299 |
} |
|
| 296 | 300 |
|
| 297 |
// System.out.println(" ajusted="+(tigerMatch));
|
|
| 298 |
tigerMatchesList.add(tigerMatch); |
|
| 301 |
if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus
|
|
| 302 |
if (presenceMapped.get(right) > 0) {
|
|
| 303 |
right += offsetsMapped.getInt(right * Integer.BYTES); |
|
| 304 |
} |
|
| 305 |
} |
|
| 306 |
// System.out.println(" M="+match[i]+" ("+left+", "+right+")");
|
|
| 299 | 307 |
|
| 300 |
if (!useSubMatches) { // use only the first submatch
|
|
| 301 |
break; |
|
| 308 |
TIGERMatch tigerMatch = new TIGERMatch(left, right); |
|
| 309 |
|
|
| 310 |
// System.out.println(" ajusted="+(tigerMatch));
|
|
| 311 |
tigerMatchesList.add(tigerMatch); |
|
| 312 |
|
|
| 313 |
if (!useSubMatches) { // use only the first submatch
|
|
| 314 |
break; |
|
| 315 |
} |
|
| 316 |
} else {
|
|
| 317 |
warnings.add("<"+left+", "+right+">");
|
|
| 302 | 318 |
} |
| 303 | 319 |
} |
| 304 | 320 |
} |
| 305 | 321 |
} |
| 322 |
|
|
| 323 |
if (warnings.size() > 0) {
|
|
| 324 |
Log.warning("Some TIGER matches are not in the CQP corpus: "+StringUtils.join(warnings, ", "));
|
|
| 325 |
} |
|
| 306 | 326 |
|
| 307 | 327 |
// intersect with corpus matches |
| 308 | 328 |
List<? extends Match> result2 = Match.intersect(corpus.getMatches(), new ArrayList<>(tigerMatchesList), true); |
| ... | ... | |
| 526 | 546 |
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); //$NON-NLS-1$
|
| 527 | 547 |
Integer[] cqpPositions = new Integer[sent_size]; |
| 528 | 548 |
Integer[] offsets = new Integer[sent_size]; |
| 549 |
boolean error = false; |
|
| 529 | 550 |
for (int t = 0; t < sent_size; t++) {
|
| 530 | 551 |
if (ids_idx[t] >= 0) {
|
| 531 | 552 |
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); //$NON-NLS-1$
|
| ... | ... | |
| 536 | 557 |
} |
| 537 | 558 |
else { // word not in the CQP corpus
|
| 538 | 559 |
Log.warning("Could not find word for id=" + ids[t]);
|
| 560 |
|
|
| 539 | 561 |
cqpPositions[t] = null; |
| 562 |
error = true; |
|
| 540 | 563 |
} |
| 541 | 564 |
|
| 542 | 565 |
if (cqpPositions[t] != null) {
|
| ... | ... | |
| 546 | 569 |
offsets[t] = null; |
| 547 | 570 |
} |
| 548 | 571 |
} |
| 572 |
if (error) {
|
|
| 573 |
Log.warning(" IDS =" + " "+ids.length+" "+Arrays.toString(ids));
|
|
| 574 |
Log.warning(" IDS_IDX =" + " "+ids_idx.length+" "+Arrays.toString(ids_idx));
|
|
| 575 |
Log.warning(" CQP =" + " "+cqpPositions.length+" "+Arrays.toString(cqpPositions));
|
|
| 576 |
Log.warning(" TIGER =" + " "+tigerPositions.length+" "+Arrays.toString(tigerPositions));
|
|
| 577 |
Log.warning(" OFFSET =" + " "+offsets.length+" "+Arrays.toString(offsets));
|
|
| 578 |
} |
|
| 549 | 579 |
// System.out.println("ids="+Arrays.toString(ids));
|
| 550 | 580 |
// System.out.println("cqp indexes="+Arrays.toString(ids_idx));
|
| 551 | 581 |
// System.out.println("tiger positions="+Arrays.toString(tigerPositions));
|
| TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4025) | ||
|---|---|---|
| 64 | 64 |
files.sort() |
| 65 | 65 |
|
| 66 | 66 |
println "Add XmlId if necessary & remove empty nodes" |
| 67 |
|
|
| 68 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
| 69 |
|
|
| 67 | 70 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
| 68 | 71 |
for (File conlluFile : files) {
|
| 69 | 72 |
cpb_texts.tick() |
| ... | ... | |
| 85 | 88 |
continue; // next ! |
| 86 | 89 |
} |
| 87 | 90 |
} |
| 91 |
|
|
| 88 | 92 |
def temp_multiwords = [:] |
| 89 |
|
|
| 90 | 93 |
for (int i = 0 ; i < lines.size() ; i++) {
|
| 91 | 94 |
String line = lines[i] |
| 92 | 95 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
| 93 | 96 |
|
| 94 | 97 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
| 95 | 98 |
|
| 96 |
if (split[-1] != null && !split[-1].contains("XmlId=")) {
|
|
| 99 |
if (temp_multiwords.containsKey(split[0])) { // this word XMLid must be the same as its multiword id, see below
|
|
| 100 |
String id = temp_multiwords.remove(split[0]); |
|
| 97 | 101 |
if (split[-1] == "_") {
|
| 98 |
split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
|
|
| 102 |
split[-1] = "XmlId="+id
|
|
| 99 | 103 |
} else {
|
| 100 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
|
|
| 104 |
split[-1] += "|XmlId="+id
|
|
| 101 | 105 |
} |
| 106 |
} else {
|
|
| 107 |
|
|
| 108 |
if (split[-1] != null && !split[-1].contains("XmlId=")) { // There is no XmlID -> create one and manage subwords
|
|
| 109 |
String id = "w_"+textid+"_"+(wcounter++); |
|
| 110 |
if (split[-1] == "_") {
|
|
| 111 |
split[-1] = "XmlId="+id |
|
| 112 |
} else {
|
|
| 113 |
split[-1] += "|XmlId="+id |
|
| 114 |
} |
|
| 115 |
|
|
| 116 |
if (split[0].contains("-") && contractionsManagement == "surface") {
|
|
| 117 |
temp_multiwords = [:] // reset to avoid using another multiwords |
|
| 118 |
String[] fromstart= split[0].split("-", 2)
|
|
| 119 |
int pfrom = Integer.parseInt(fromstart[0]) |
|
| 120 |
int pend = Integer.parseInt(fromstart[1]) |
|
| 121 |
for (int p = pfrom ; p <= pend ; p++) {
|
|
| 122 |
temp_multiwords.put(""+p, id)
|
|
| 123 |
} |
|
| 124 |
println temp_multiwords |
|
| 125 |
} |
|
| 126 |
} |
|
| 102 | 127 |
} |
| 103 | 128 |
|
| 104 | 129 |
lines[i] = split.join("\t") // rebuild the line
|
| ... | ... | |
| 111 | 136 |
|
| 112 | 137 |
// Keep or not contractions |
| 113 | 138 |
File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm") |
| 114 |
|
|
| 115 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
| 116 | 139 |
|
| 117 | 140 |
if (contractionsManagement == UDPreferences.ALL) {
|
| 118 | 141 |
conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done |
| 119 | 142 |
} else {
|
| 120 |
|
|
| 143 |
|
|
| 121 | 144 |
conlluSrcForTXMDirectory.deleteDir() |
| 122 | 145 |
conlluSrcForTXMDirectory.mkdirs() |
| 123 |
|
|
| 124 |
println "Contractions managment mode is '$contractionsManagement'" |
|
| 146 |
|
|
| 147 |
println "Contractions management mode is '$contractionsManagement'"
|
|
| 125 | 148 |
cpb_texts = new ConsoleProgressBar(files.size()) |
| 126 | 149 |
for (File conlluFile : files) {
|
| 127 | 150 |
cpb_texts.tick() |
| ... | ... | |
| 142 | 165 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue;
|
| 143 | 166 |
|
| 144 | 167 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length);
|
| 145 |
|
|
| 168 |
|
|
| 146 | 169 |
if (contractionsManagement == UDPreferences.SYNTAX) {
|
| 147 | 170 |
if (split[0].contains("-")) {
|
| 148 | 171 |
|
| 149 | 172 |
// stores the syntactic word id and the orthographic word properties |
| 150 |
temp_multiwords = [:] |
|
| 173 |
temp_multiwords = [:] // reset to avoid using another multiwords
|
|
| 151 | 174 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-")));
|
| 152 | 175 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
| 153 | 176 |
for (int ii = n1 ; ii <= n2 ; ii++) {
|
| TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4025) | ||
|---|---|---|
| 30 | 30 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-")));
|
| 31 | 31 |
int n = n2 - n1; |
| 32 | 32 |
|
| 33 |
//System.out.println("Word "+Arrays.toString(split));
|
|
| 34 |
//System.out.println("lines to insert: "+n);
|
|
| 35 | 33 |
if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) {
|
| 36 | 34 |
|
| 37 | 35 |
ArrayList<String[]> newlines = new ArrayList<>(); |
| ... | ... | |
| 73 | 71 |
} else {
|
| 74 | 72 |
//System.out.println("NOT FIXING "+conll.get(i));
|
| 75 | 73 |
} |
| 76 |
|
|
| 77 |
|
|
| 78 | 74 |
} |
| 79 | 75 |
} |
| 80 | 76 |
|
| TXM/trunk/bundles/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 4025) | ||
|---|---|---|
| 5 | 5 |
Bundle-SymbolicName: org.txm.treetagger.core;singleton:=true |
| 6 | 6 |
Bundle-Version: 1.0.0.qualifier |
| 7 | 7 |
Bundle-Name: TreeTagger Core |
| 8 |
Require-Bundle: org.txm.nlp.core;bundle-version="1.0.0",
|
|
| 9 |
org.txm.core
|
|
| 8 |
Require-Bundle: org.txm.core,
|
|
| 9 |
org.txm.nlp.core;bundle-version="1.0.0"
|
|
| 10 | 10 |
Bundle-ActivationPolicy: lazy |
| 11 | 11 |
Bundle-ManifestVersion: 2 |
| 12 | 12 |
Bundle-RequiredExecutionEnvironment: JavaSE-16 |
| TXM/trunk/bundles/org.txm.udpipe.core/plugin.xml (revision 4025) | ||
|---|---|---|
| 2 | 2 |
<?eclipse version="3.4"?> |
| 3 | 3 |
<plugin> |
| 4 | 4 |
<extension |
| 5 |
point="org.txm.annotation.core.AnnotationEngine">
|
|
| 5 |
point="org.txm.nlp.core.NLPEngine">
|
|
| 6 | 6 |
<AnnotationEngine |
| 7 | 7 |
class="org.txm.udpipe.core.UDPipeEngine" |
| 8 | 8 |
description="UDPipe wrapper"> |
Formats disponibles : Unified diff