Révision 4025
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TSCorpus.java (revision 4025) | ||
---|---|---|
219 | 219 |
* @param tigerPosition |
220 | 220 |
* @return 1 if the TIGER position has a position in CQP ; 0 if not |
221 | 221 |
*/ |
222 |
public int getPresence(int tigerPosition) {
|
|
222 |
public byte getPresence(int tigerPosition) {
|
|
223 | 223 |
if (presencesMapped != null) { |
224 |
return presencesMapped.getInt(tigerPosition * Integer.BYTES);
|
|
224 |
return presencesMapped.get(tigerPosition);
|
|
225 | 225 |
} |
226 | 226 |
else { |
227 | 227 |
return 0; |
... | ... | |
233 | 233 |
* @param tigerPositions |
234 | 234 |
* @return an array holding, for each given TIGER position, 1 if it has a position in CQP ; 0 if not |
235 | 235 |
*/ |
236 |
public int[] getPresences(int tigerPositions[]) {
|
|
237 |
int[] ret = new int[tigerPositions.length];
|
|
236 |
public byte[] getPresences(int tigerPositions[]) {
|
|
237 |
byte[] ret = new byte[tigerPositions.length];
|
|
238 | 238 |
if (presencesMapped != null) { |
239 | 239 |
for (int i = 0; i < tigerPositions.length; i++) { |
240 |
ret[i] = presencesMapped.getInt(tigerPositions[i] * Integer.BYTES);
|
|
240 |
ret[i] = presencesMapped.get(tigerPositions[i] * Integer.BYTES); |
|
241 | 241 |
} |
242 | 242 |
} |
243 | 243 |
|
TXM/trunk/bundles/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 4025) | ||
---|---|---|
42 | 42 |
import org.txm.utils.io.IOUtils; |
43 | 43 |
import org.txm.utils.logger.Log; |
44 | 44 |
|
45 |
import cern.colt.Arrays; |
|
45 | 46 |
import ims.tiger.corpus.Feature; |
46 | 47 |
import ims.tiger.corpus.Header; |
47 | 48 |
import ims.tiger.corpus.Sentence; |
... | ... | |
258 | 259 |
int iPivot = variables.indexOf("pivot"); //$NON-NLS-1$ |
259 | 260 |
|
260 | 261 |
MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
262 |
MappedByteBuffer presenceMapped = tcorpus.getPresencesMapped(); |
|
261 | 263 |
// MappedByteBuffer offsetsMapped = tcorpus.getOffsetsMapped(); |
264 |
|
|
265 |
ArrayList<String> warnings = new ArrayList<String>(); |
|
262 | 266 |
|
263 | 267 |
boolean useSubMatches = TigerSearchTreePreferences.getInstance().getBoolean(TigerSearchTreePreferences.USESUBMATCHES); |
264 | 268 |
|
... | ... | |
282 | 286 |
if (iPivot != -1 && i != iPivot) continue; // skip match that are not 'pivot' |
283 | 287 |
|
284 | 288 |
int left = sent_start + index.getLeftCorner(sent, match[i]); |
285 |
if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus |
|
286 |
left += offsetsMapped.getInt(left * Integer.BYTES); |
|
287 |
// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES)); |
|
288 |
} |
|
289 | 289 |
int right = sent_start + index.getRightCorner(sent, match[i]); |
290 |
if (offsetsMapped != null) { // the TIGER token is not in the CQP corpus |
|
291 |
right += offsetsMapped.getInt(right * Integer.BYTES); |
|
292 |
} |
|
293 |
// System.out.println(" M="+match[i]+" ("+left+", "+right+")"); |
|
290 |
|
|
291 |
// test if the match position is also in the CQP positions |
|
292 |
if (presenceMapped.get(left) > 0 && presenceMapped.get(right) > 0) { |
|
294 | 293 |
|
295 |
TIGERMatch tigerMatch = new TIGERMatch(left, right); |
|
294 |
if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus |
|
295 |
if (presenceMapped.get(left) > 0) { |
|
296 |
left += offsetsMapped.getInt(left * Integer.BYTES); |
|
297 |
} |
|
298 |
// System.out.println("left="+left+" offset="+offsetsMapped.getInt(left*Integer.BYTES)); |
|
299 |
} |
|
296 | 300 |
|
297 |
// System.out.println(" ajusted="+(tigerMatch)); |
|
298 |
tigerMatchesList.add(tigerMatch); |
|
301 |
if (offsetsMapped != null && presenceMapped != null) { // the TIGER token is not in the CQP corpus |
|
302 |
if (presenceMapped.get(right) > 0) { |
|
303 |
right += offsetsMapped.getInt(right * Integer.BYTES); |
|
304 |
} |
|
305 |
} |
|
306 |
// System.out.println(" M="+match[i]+" ("+left+", "+right+")"); |
|
299 | 307 |
|
300 |
if (!useSubMatches) { // use only the first submatch |
|
301 |
break; |
|
308 |
TIGERMatch tigerMatch = new TIGERMatch(left, right); |
|
309 |
|
|
310 |
// System.out.println(" ajusted="+(tigerMatch)); |
|
311 |
tigerMatchesList.add(tigerMatch); |
|
312 |
|
|
313 |
if (!useSubMatches) { // use only the first submatch |
|
314 |
break; |
|
315 |
} |
|
316 |
} else { |
|
317 |
warnings.add("<"+left+", "+right+">"); |
|
302 | 318 |
} |
303 | 319 |
} |
304 | 320 |
} |
305 | 321 |
} |
322 |
|
|
323 |
if (warnings.size() > 0) { |
|
324 |
Log.warning("Some TIGER matches are not in the CQP corpus: "+StringUtils.join(warnings, ", ")); |
|
325 |
} |
|
306 | 326 |
|
307 | 327 |
// intersect with corpus matches |
308 | 328 |
List<? extends Match> result2 = Match.intersect(corpus.getMatches(), new ArrayList<>(tigerMatchesList), true); |
... | ... | |
526 | 546 |
int[] ids_idx = CQI.str2Id(corpus.getProperty("id").getQualifiedName(), ids); //$NON-NLS-1$ |
527 | 547 |
Integer[] cqpPositions = new Integer[sent_size]; |
528 | 548 |
Integer[] offsets = new Integer[sent_size]; |
549 |
boolean error = false; |
|
529 | 550 |
for (int t = 0; t < sent_size; t++) { |
530 | 551 |
if (ids_idx[t] >= 0) { |
531 | 552 |
int[] positions = CQI.id2Cpos(corpus.getProperty("id").getQualifiedName(), ids_idx[t]); //$NON-NLS-1$ |
... | ... | |
536 | 557 |
} |
537 | 558 |
else { // word not in the CQP corpus |
538 | 559 |
Log.warning("Could not find word for id=" + ids[t]); |
560 |
|
|
539 | 561 |
cqpPositions[t] = null; |
562 |
error = true; |
|
540 | 563 |
} |
541 | 564 |
|
542 | 565 |
if (cqpPositions[t] != null) { |
... | ... | |
546 | 569 |
offsets[t] = null; |
547 | 570 |
} |
548 | 571 |
} |
572 |
if (error) { |
|
573 |
Log.warning(" IDS =" + " "+ids.length+" "+Arrays.toString(ids)); |
|
574 |
Log.warning(" IDS_IDX =" + " "+ids_idx.length+" "+Arrays.toString(ids_idx)); |
|
575 |
Log.warning(" CQP =" + " "+cqpPositions.length+" "+Arrays.toString(cqpPositions)); |
|
576 |
Log.warning(" TIGER =" + " "+tigerPositions.length+" "+Arrays.toString(tigerPositions)); |
|
577 |
Log.warning(" OFFSET =" + " "+offsets.length+" "+Arrays.toString(offsets)); |
|
578 |
} |
|
549 | 579 |
// System.out.println("ids="+Arrays.toString(ids)); |
550 | 580 |
// System.out.println("cqp indexes="+Arrays.toString(ids_idx)); |
551 | 581 |
// System.out.println("tiger positions="+Arrays.toString(tigerPositions)); |
TXM/trunk/bundles/org.txm.conllu.core/groovy/org/txm/scripts/importer/conllu/CoNLLUImporter.groovy (revision 4025) | ||
---|---|---|
64 | 64 |
files.sort() |
65 | 65 |
|
66 | 66 |
println "Add XmlId if necessary & remove empty nodes" |
67 |
|
|
68 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
69 |
|
|
67 | 70 |
ConsoleProgressBar cpb_texts = new ConsoleProgressBar(files.size()) |
68 | 71 |
for (File conlluFile : files) { |
69 | 72 |
cpb_texts.tick() |
... | ... | |
85 | 88 |
continue; // next ! |
86 | 89 |
} |
87 | 90 |
} |
91 |
|
|
88 | 92 |
def temp_multiwords = [:] |
89 |
|
|
90 | 93 |
for (int i = 0 ; i < lines.size() ; i++) { |
91 | 94 |
String line = lines[i] |
92 | 95 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue; |
93 | 96 |
|
94 | 97 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
95 | 98 |
|
96 |
if (split[-1] != null && !split[-1].contains("XmlId=")) { |
|
99 |
if (temp_multiwords.containsKey(split[0])) { // this word XMLid must be the same as its multiword id, see below |
|
100 |
String id = temp_multiwords.remove(split[0]); |
|
97 | 101 |
if (split[-1] == "_") { |
98 |
split[-1] = "XmlId=w_"+textid+"_"+(wcounter++)
|
|
102 |
split[-1] = "XmlId="+id
|
|
99 | 103 |
} else { |
100 |
split[-1] += "|XmlId=w_"+textid+"_"+(wcounter++)
|
|
104 |
split[-1] += "|XmlId="+id
|
|
101 | 105 |
} |
106 |
} else { |
|
107 |
|
|
108 |
if (split[-1] != null && !split[-1].contains("XmlId=")) { // There is no XmlID -> create one and manage subwords |
|
109 |
String id = "w_"+textid+"_"+(wcounter++); |
|
110 |
if (split[-1] == "_") { |
|
111 |
split[-1] = "XmlId="+id |
|
112 |
} else { |
|
113 |
split[-1] += "|XmlId="+id |
|
114 |
} |
|
115 |
|
|
116 |
if (split[0].contains("-") && contractionsManagement == "surface") { |
|
117 |
temp_multiwords = [:] // reset to avoid using another multiwords |
|
118 |
String[] fromstart= split[0].split("-", 2) |
|
119 |
int pfrom = Integer.parseInt(fromstart[0]) |
|
120 |
int pend = Integer.parseInt(fromstart[1]) |
|
121 |
for (int p = pfrom ; p <= pend ; p++) { |
|
122 |
temp_multiwords.put(""+p, id) |
|
123 |
} |
|
124 |
println temp_multiwords |
|
125 |
} |
|
126 |
} |
|
102 | 127 |
} |
103 | 128 |
|
104 | 129 |
lines[i] = split.join("\t") // rebuild the line |
... | ... | |
111 | 136 |
|
112 | 137 |
// Keep or not contractions |
113 | 138 |
File conlluSrcForTXMDirectory = new File(outputDirectory.getParentFile().getParentFile(), "conllu-fortxm") |
114 |
|
|
115 |
String contractionsManagement = UDPreferences.getInstance().getProjectPreferenceValue(project, UDPreferences.CONTRACTIONS_MANAGEMENT, UDPreferences.getInstance().getString(UDPreferences.CONTRACTIONS_MANAGEMENT)); |
|
116 | 139 |
|
117 | 140 |
if (contractionsManagement == UDPreferences.ALL) { |
118 | 141 |
conlluSrcForTXMDirectory = conlluSrcDirectory; // use the same directory as TIGER since no word modifications have been done |
119 | 142 |
} else { |
120 |
|
|
143 |
|
|
121 | 144 |
conlluSrcForTXMDirectory.deleteDir() |
122 | 145 |
conlluSrcForTXMDirectory.mkdirs() |
123 |
|
|
124 |
println "Contractions managment mode is '$contractionsManagement'" |
|
146 |
|
|
147 |
println "Contractions management mode is '$contractionsManagement'"
|
|
125 | 148 |
cpb_texts = new ConsoleProgressBar(files.size()) |
126 | 149 |
for (File conlluFile : files) { |
127 | 150 |
cpb_texts.tick() |
... | ... | |
142 | 165 |
if (line.length() == 0 || line.startsWith("#") || !line.contains("\t")) continue; |
143 | 166 |
|
144 | 167 |
def split = line.split("\t", ImportCoNLLUAnnotations.UD_PROPERTY_NAMES.length); |
145 |
|
|
168 |
|
|
146 | 169 |
if (contractionsManagement == UDPreferences.SYNTAX) { |
147 | 170 |
if (split[0].contains("-")) { |
148 | 171 |
|
149 | 172 |
// stores the syntactic word id and the orthographic word properties |
150 |
temp_multiwords = [:] |
|
173 |
temp_multiwords = [:] // reset to avoid using another multiwords
|
|
151 | 174 |
int n1 = Integer.parseInt(split[0].substring(0, split[0].indexOf("-"))); |
152 | 175 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-"))); |
153 | 176 |
for (int ii = n1 ; ii <= n2 ; ii++) { |
TXM/trunk/bundles/org.txm.conllu.core/src/org/txm/conllu/core/function/BratPrintTree.java (revision 4025) | ||
---|---|---|
30 | 30 |
int n2 = Integer.parseInt(split[0].substring(1 + split[0].indexOf("-"))); |
31 | 31 |
int n = n2 - n1; |
32 | 32 |
|
33 |
//System.out.println("Word "+Arrays.toString(split)); |
|
34 |
//System.out.println("lines to insert: "+n); |
|
35 | 33 |
if ( !(splittedLines.get(i+1)[0].equals(""+n1)) || !(splittedLines.get(i+n+1)[0].equals(""+n2)) ) { |
36 | 34 |
|
37 | 35 |
ArrayList<String[]> newlines = new ArrayList<>(); |
... | ... | |
73 | 71 |
} else { |
74 | 72 |
//System.out.println("NOT FIXING "+conll.get(i)); |
75 | 73 |
} |
76 |
|
|
77 |
|
|
78 | 74 |
} |
79 | 75 |
} |
80 | 76 |
|
TXM/trunk/bundles/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 4025) | ||
---|---|---|
5 | 5 |
Bundle-SymbolicName: org.txm.treetagger.core;singleton:=true |
6 | 6 |
Bundle-Version: 1.0.0.qualifier |
7 | 7 |
Bundle-Name: TreeTagger Core |
8 |
Require-Bundle: org.txm.nlp.core;bundle-version="1.0.0",
|
|
9 |
org.txm.core
|
|
8 |
Require-Bundle: org.txm.core,
|
|
9 |
org.txm.nlp.core;bundle-version="1.0.0"
|
|
10 | 10 |
Bundle-ActivationPolicy: lazy |
11 | 11 |
Bundle-ManifestVersion: 2 |
12 | 12 |
Bundle-RequiredExecutionEnvironment: JavaSE-16 |
TXM/trunk/bundles/org.txm.udpipe.core/plugin.xml (revision 4025) | ||
---|---|---|
2 | 2 |
<?eclipse version="3.4"?> |
3 | 3 |
<plugin> |
4 | 4 |
<extension |
5 |
point="org.txm.annotation.core.AnnotationEngine">
|
|
5 |
point="org.txm.nlp.core.NLPEngine">
|
|
6 | 6 |
<AnnotationEngine |
7 | 7 |
class="org.txm.udpipe.core.UDPipeEngine" |
8 | 8 |
description="UDPipe wrapper"> |
Formats disponibles : Unified diff