Révision 3540

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/misc/TXM2CoNLL2009Macro.groovy (revision 3540)
1
package org.txm.macro.misc
2

  
3
import org.txm.scripts.export.conll2009.ToCoNLL2009
4

  
5
import java.io.File;
6
import org.txm.searchengine.cqp.corpus.*
7
import org.kohsuke.args4j.*
8
import groovy.transform.Field
9
import org.txm.rcp.swt.widget.parameters.*
10

  
11
// Exports the selected CQP corpus (or sub-corpus) to a file with ToCoNLL2009.
// The macro only works on a corpus selection.
if (!(corpusViewSelection instanceof CQPCorpus)) {
	println "Select a corpus or a sub-corpus"
	return;
}
def corpus = corpusViewSelection

// Macro parameters, edited by the user in the ParametersDialog opened below.
// NOTE(review): the "def" values of posProperty/lemmaProperty ("pos"/"lemma") differ from the
// field initialisers ("frpos"/"frlemma") - confirm which one the dialog is expected to show.
@Field @Option(name="outputFile", usage="the result file", widget="File", required=true, def="conn_export.tsv")
File outputFile = new File("conn_export.tsv")
@Field @Option(name="encoding", usage="encoding of the result file", widget="String", required=true, def="UTF-8")
String encoding = "UTF-8"

@Field @Option(name="sentenceProperty", usage="sentence structure property, written structure_property (ex: s_id)", widget="String", required=true, def="s_id")
String sentenceProperty = "s_id"
@Field @Option(name="posProperty", usage="word property containing the part of speech", widget="String", required=true, def="pos")
String posProperty = "frpos"
@Field @Option(name="lemmaProperty", usage="word property containing the lemma", widget="String", required=true, def="lemma")
String lemmaProperty = "frlemma"

if (!ParametersDialog.open(this)) return;

outputFile = outputFile.getAbsoluteFile()

// sentenceProperty must be "<structure>_<property>": split on the first underscore only
def split = sentenceProperty.split("_", 2)
if (split.length != 2) { println "Error sentence property: $sentenceProperty"; return}

// the structure may not exist in the corpus: resolve it null-safely so the check below can report it
def sentenceStructure = corpus.getStructuralUnit(split[0])
StructuralUnitProperty s = sentenceStructure?.getProperty(split[1])
Property word = corpus.getProperty("word")
Property lemma = corpus.getProperty(lemmaProperty)
Property pos = corpus.getProperty(posProperty)

// stop with an explicit message when one of the needed properties is missing
if (s == null) { println "Error sentence property: $sentenceProperty"; return}
if (word == null) { println "Error no word property"; return}
if (pos == null) { println "Error pos property: $posProperty"; return}
if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}

ToCoNLL2009 processor = new ToCoNLL2009()
processor.process(outputFile, corpus, s, word, lemma, pos, encoding)

println "Done: $outputFile"
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/export/conll2009/ToCoNLL2009.groovy (revision 3540)
14 14

  
15 15
public class ToCoNLL2009 {
16 16
	
17
	boolean debug = true
17
	boolean debug = false
18 18
	/**
19 19
	 * To CoNLL2009.
20 20
	 *
......
80 80
				writer.write("# newdoc id=$text\n");
81 81
				writer.write("# sent_id=$match\n");
82 82
				
83
				id = 0 // first position in sentence
84
				for (i = match.start ; i < match.end ; i++) { // loop n times, n = mathc length
83
				id = 1 // first position in sentence
84
				for (i = match.start ; i <= match.end ; i++) { // loop n times, n = mathc length
85 85
					
86 86
					if (lemme != null)
87 87
						lemmestr = lemmes.get(iposition);
......
94 94
						posstr = "_";
95 95
					
96 96
					//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
97
					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n");
97
					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n");
98 98
					
99 99
					id++; // next word
100 100
					iposition++ // nextposition
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 3540)
9 9

  
10 10
import org.txm.scripts.importer.StaxStackWriter;
11 11
import org.eclipse.ui.part.PageSwitcher
12
import org.txm.core.preferences.TBXPreferences
12 13
import org.txm.importer.xtz.*
13 14

  
14 15
public class XTZDefaultPagerStep {
......
659 660
									endOfLastWord = lastword.subSequence(l-1, l)
660 661
								}
661 662
							
662
								String interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid
663
							
663
								// Build the tooltip text of the word: one "- ..." line per property, then the word id.
								// Starts from an empty string: appending to a null String would prefix the text with "null".
								String interpvalue = "";
								def tooltipProperties = pager.project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*");
								if (tooltipProperties.equals("*")) { // "*" = show all the properties
									interpvalue = "- "+anaValues.entrySet().join("\n- ")+"\n- "+wordid
								} else { // show only the listed properties (comma separated list)
									for (String p : tooltipProperties.split(",")) {
										if (p.length() == 0) continue; // empty list or trailing comma: no property to show
										interpvalue += "- "+p+"="+anaValues.get(p)+"\n"
									}
									interpvalue += "- "+wordid
								}
674
								
664 675
								if (NoSpaceBefore.contains(wordvalue) ||
665 676
										NoSpaceAfter.contains(lastword) ||
666 677
										wordvalue.startsWith("-") ||
TXM/trunk/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/MainCorpus.java (revision 3540)
29 29

  
30 30
import java.io.File;
31 31
import java.io.IOException;
32
import java.text.NumberFormat;
32 33
import java.util.ArrayList;
33 34
import java.util.Arrays;
34 35
import java.util.HashMap;
......
368 369
	@Override
369 370
	public String getDetails() {
370 371
		try {
371
			return this.getName() + " T=" + this.getSize();
372
			return this.getName() + " T=" + NumberFormat.getInstance().format(this.getSize());
372 373
		}
373 374
		catch (CqiClientException e) {
374 375
			return "corpus not ready.";
TXM/trunk/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/Subcorpus.java (revision 3540)
29 29

  
30 30
import java.io.File;
31 31
import java.io.IOException;
32
import java.text.NumberFormat;
32 33
import java.util.ArrayList;
33 34
import java.util.Arrays;
34 35
import java.util.HashMap;
......
765 766
		return textids;
766 767
	}
767 768
	
768
	
769 769
	@Override
770 770
	public String getName() {
771 771
		return this.getCorpusParent().getSimpleName() + TXMPreferences.PARENT_NAME_SEPARATOR + this.getSimpleName();
......
775 775
	public String getDetails() {
776 776
		try {
777 777
			if (hasBeenComputedOnce()) {
778
				return "T " + this.getSize();
778
				return "CQP ID "+this.pID+" T " + NumberFormat.getInstance().format(this.getSize());
779 779
			} else {
780 780
				return this.getName();
781 781
			}
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/EditionSection.java (revision 3540)
19 19
import org.eclipse.ui.forms.widgets.TableWrapData;
20 20
import org.eclipse.ui.forms.widgets.TableWrapLayout;
21 21
import org.txm.Toolbox;
22
import org.txm.core.preferences.TBXPreferences;
23
import org.txm.core.preferences.TXMPreferences;
22 24
import org.txm.objects.EditionDefinition;
23 25
import org.txm.objects.Project;
24 26
import org.txm.rcp.editors.imports.ImportFormEditor;
......
51 53
	private Text defaultEditions;
52 54

  
53 55
	private Combo unmanagedElementsPolicyCombo;
56

  
57
	private Text defaultEditionTooltipPropertiesText;
54 58
	
55 59
	/**
56 60
	 * 
......
220 224
			gdata = getTextGridData();
221 225
			gdata.colspan = 2;
222 226
			defaultEditions.setLayoutData(gdata);
227
			
228
			tmpLabel = toolkit.createLabel(sectionClient, "Tooltip properties");
229
			tmpLabel.setToolTipText("comma separated list (comma*=all)");
230
			gdata = getLabelGridData();
231
			gdata.indent = 20;
232
			tmpLabel.setLayoutData(gdata);
233
			
234
			defaultEditionTooltipPropertiesText = toolkit.createText(sectionClient, "", SWT.BORDER); //$NON-NLS-1$
235
			gdata = getTextGridData();
236
			gdata.colspan = 2;
237
			defaultEditionTooltipPropertiesText.setLayoutData(gdata);
223 238
		}
224 239
		
225 240
		//// First draft of interface to add editions to a corpus
......
323 338
			}
324 339
		}
325 340
		
341
		if (defaultEditionTooltipPropertiesText != null) {
342
			defaultEditionTooltipPropertiesText.setText(project.getEditionDefinition("default").get(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, "*").replace(" ", ""));
343
		}
344
		
326 345
		EditionDefinition edition_params = project.getEditionDefinition("facs"); //$NON-NLS-1$
327 346
		boolean bfe = edition_params.getBuildEdition();
328 347
		if (buildFacsEditionCheckButton != null && edition_params != null) {
......
376 395
				project.setTextualPlan("UnmanagedElementsPolicy", unmanagedElementsPolicyCombo.getText());
377 396
			}
378 397
			
398
			if (defaultEditionTooltipPropertiesText != null) {
399
				project.getEditionDefinition("default").set(TBXPreferences.EDITION_DEFINITION_TOOLTIP_PROPERTIES, defaultEditionTooltipPropertiesText.getText().replace(" ", ""));
400
			}
401
			
379 402
			if (buildFacsEditionCheckButton != null) {
380 403
				
381 404
				String images_directory = ""; //$NON-NLS-1$
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsCoNLLU.java (revision 3540)
66 66
	
67 67
	public static final String ID = ExportCorpusAsCoNLLU.class.getName();
68 68
	
69
	@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory")
70
	File conlluResultDirectory;
69
	@Option(name="outputDirectory", usage="an example file", widget="Folder", required=true, def="outputDirectory")
70
	File outputDirectory;
71 71
	
72
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
73
	String propertiesPrefix;
72
	@Option(name="encoding", usage="sentenceProperty", widget="String", required=true, def="UTF-8")
73
	String encoding = "UTF-8";
74

  
75
	 @Option(name="sentenceStructure", usage="sentenceProperty", widget="String", required=true, def="s")
76
	String sentenceStructure;
74 77
	
75
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
76
	Boolean separator = false;
78
	@Option(name="posProperty", usage="if set posProperty used to fill the UPOS ud property", widget="String", required=true, def="frpos")
79
	String posProperty;
77 80
	
78
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
79
	Boolean insertParagraphs = false;
81
	@Option(name="lemmaProperty", usage="if set lemmaProperty used to fill the LEMMA ud property", widget="String", required=true, def="frlemma")
82
	String lemmaProperty;
80 83
	
81
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
82
	Boolean detectGap = false;
83
	
84
	@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
85
	Boolean separator3 = false;
86
	
87
	@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
88
	Boolean insertNoSpaceAfter = true;
89
	
90
	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
91
	Boolean insertTokenWithoutUdAnnotations;
92
	
93
	// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
94
	@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties")
95
	Boolean separator_properties = false;
96
	
97
	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
98
	String defaultFormPropertyName;
99
	
100
	@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
101
	String defaultLemmaPropertyName;
102
	
103
	@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
104
	String defaultUposPropertyName;
105
	
106
	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
107
	String defaultXposPropertyName;
108
	
109
	@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "")
110
	String defaultFeatsPropertyName;
111
	
112
	@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "")
113
	String defaultHeadPropertyName;
114
	
115
	@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "")
116
	String defaultDeprelPropertyName;
117
	
118
	@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "")
119
	String defaultDepsPropertyName;
120
	
121
	@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "")
122
	String defaultMiscPropertyName;
123
	
124
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
125
	Boolean separator2 = false;
126
	
127
	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
128
	String openingPunct;
129
	
130 84
	/**
131 85
	 * the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix
132 86
	 */
......
151 105
			return null;
152 106
		}
153 107
		
154
		conlluResultDirectory.mkdirs();
155
		if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) {
156
			Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory);
108
		outputDirectory.mkdirs();
109
		if (outputDirectory == null || !outputDirectory.exists() || !outputDirectory.isDirectory()) {
110
			Log.warning("Error: conllu result directory does not exists: " + outputDirectory);
157 111
			return null;
158 112
		}
159 113
		
......
161 115
		MainCorpus mainCorpus = corpus.getMainCorpus();
162 116
		
163 117
		try {
164
			return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
165
					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
166
					defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName,
167
					defaultMiscPropertyName,
168
					detectGap, insertParagraphs, insertNoSpaceAfter);
169
		}
170
		catch (Exception e) {
118
			new ToCoNLL2009().process(outputDirectory, mainCorpus, corpus.getStructuralUnit(sentenceStructure), corpus.getProperty("word"), corpus.getProperty(lemmaProperty), corpus.getProperty(posProperty), encoding);
119
		} catch (Exception e) {
171 120
			Log.warning(e);
172 121
			Log.printStackTrace(e);
173 122
		}
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ExportCorpusAsFullCoNLLU.java (revision 3540)
1
// Copyright © 2010-2020 ENS de Lyon., University of Franche-Comté
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.conllu.rcp.commands;
29

  
30
import java.io.File;
31
import java.io.IOException;
32
import java.io.PrintWriter;
33
import java.util.ArrayList;
34
import java.util.HashMap;
35
import java.util.HashSet;
36

  
37
import org.apache.commons.lang.StringUtils;
38
import org.eclipse.core.commands.AbstractHandler;
39
import org.eclipse.core.commands.ExecutionEvent;
40
import org.eclipse.core.commands.ExecutionException;
41
import org.eclipse.jface.viewers.IStructuredSelection;
42
import org.eclipse.osgi.util.NLS;
43
import org.eclipse.ui.handlers.HandlerUtil;
44
import org.kohsuke.args4j.Option;
45
import org.txm.rcp.swt.widget.parameters.ParametersDialog;
46
import org.txm.searchengine.cqp.CQPSearchEngine;
47
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
48
import org.txm.searchengine.cqp.clientExceptions.InvalidCqpIdException;
49
import org.txm.searchengine.cqp.clientExceptions.UnexpectedAnswerException;
50
import org.txm.searchengine.cqp.corpus.CQPCorpus;
51
import org.txm.searchengine.cqp.corpus.MainCorpus;
52
import org.txm.searchengine.cqp.corpus.StructuralUnit;
53
import org.txm.searchengine.cqp.corpus.WordProperty;
54
import org.txm.searchengine.cqp.corpus.query.CQLQuery;
55
import org.txm.searchengine.cqp.serverException.CqiServerError;
56
import org.txm.utils.i18n.LangFormater;
57
import org.txm.utils.io.IOUtils;
58
import org.txm.utils.logger.Log;
59

  
60
/**
61
 * Export the conllu properties and CQP words into a conllu corpus of several files (one per text)
62
 * 
63
 * @author mdecorde.
64
 */
65
public class ExportCorpusAsFullCoNLLU extends AbstractHandler {
66
	
67
	public static final String ID = ExportCorpusAsFullCoNLLU.class.getName();
68
	
69
	@Option(name = "conlluResultDirectory", usage = "conlluResultDirectory", widget = "Folder", required = true, def = "conllu-result-directory")
70
	File conlluResultDirectory;
71
	
72
	@Option(name = "propertiesPrefix", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "ud-")
73
	String propertiesPrefix;
74
	
75
	@Option(name = "separator", usage = "Options", widget = "Separator", required = true, def = "comment properties")
76
	Boolean separator = false;
77
	
78
	@Option(name = "insertParagraphs", usage = "Insert paragraph marks in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
79
	Boolean insertParagraphs = false;
80
	
81
	@Option(name = "detectGap", usage = "Insert gap comment using the CQP 'gap' property", widget = "Boolean", required = true, def = "true")
82
	Boolean detectGap = false;
83
	
84
	@Option(name = "separator3", usage = "Options", widget = "Separator", required = true, def = "tokens options")
85
	Boolean separator3 = false;
86
	
87
	@Option(name = "insertNoSpaceAfter", usage = "Insert the NoSpaceAfter misc property if not initially in the CoNLLU corpus", widget = "Boolean", required = true, def = "true")
88
	Boolean insertNoSpaceAfter = true;
89
	
90
	@Option(name = "insertTokenWithoutUdAnnotations", usage = "if checked words without ud annotations are exported as well", widget = "Boolean", required = false, def = "false")
91
	Boolean insertTokenWithoutUdAnnotations;
92
	
93
	// "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
94
	@Option(name = "separator_properties", usage = "Options", widget = "Separator", required = true, def = "fixing UD properties")
95
	Boolean separator_properties = false;
96
	
97
	@Option(name = "defaultFormPropertyName", usage = "optional CQP property to fix the missing 'form' ud property", widget = "String", required = false, def = "")
98
	String defaultFormPropertyName;
99
	
100
	@Option(name = "defaultLemmaPropertyName", usage = "optional CQP property to fix the missing 'lemma' ud property", widget = "String", required = false, def = "")
101
	String defaultLemmaPropertyName;
102
	
103
	@Option(name = "defaultUposPropertyName", usage = "optional CQP property to fix the missing 'upos' ud property", widget = "String", required = false, def = "")
104
	String defaultUposPropertyName;
105
	
106
	@Option(name = "defaultXposPropertyName", usage = "optional CQP property to fix the missing 'xpos' ud property", widget = "String", required = false, def = "")
107
	String defaultXposPropertyName;
108
	
109
	@Option(name = "defaultFeatsPropertyName", usage = "optional CQP property to fix the missing 'feats' ud property", widget = "String", required = false, def = "")
110
	String defaultFeatsPropertyName;
111
	
112
	@Option(name = "defaultHeadPropertyName", usage = "optional CQP property to fix the missing 'head' ud property", widget = "String", required = false, def = "")
113
	String defaultHeadPropertyName;
114
	
115
	@Option(name = "defaultDeprelPropertyName", usage = "optional CQP property to fix the missing 'deprel' ud property", widget = "String", required = false, def = "")
116
	String defaultDeprelPropertyName;
117
	
118
	@Option(name = "defaultDepsPropertyName", usage = "optional CQP property to fix the missing 'deps' ud property", widget = "String", required = false, def = "")
119
	String defaultDepsPropertyName;
120
	
121
	@Option(name = "defaultMiscPropertyName", usage = "optional CQP property to fix the missing 'misc' ud property", widget = "String", required = false, def = "")
122
	String defaultMiscPropertyName;
123
	
124
	@Option(name = "separator2", usage = "Options", widget = "Separator", required = true, def = "sentence fix options")
125
	Boolean separator2 = false;
126
	
127
	@Option(name = "openingPunct", usage = "optional prefix for the properties to create", widget = "String", required = true, def = "[\\-–«‘“\\(]")
128
	String openingPunct;
129
	
130
	/**
131
	 * the UD property suffixes, will be used to create the CQP properties like propertiesPrefix + suffix
132
	 */
133
	public static String[] propNames = { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
134
	
135
	/*
136
	 * (non-Javadoc)
137
	 * @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent)
138
	 */
139
	@Override
140
	public Object execute(final ExecutionEvent event) throws ExecutionException {
141
		
142
		IStructuredSelection selection = (IStructuredSelection) HandlerUtil.getCurrentSelection(event);
143
		
144
		Object s = selection.getFirstElement();
145
		if (!(s instanceof MainCorpus)) {
146
			Log.warning("Selection is not a corpus. Aborting.");
147
			return null;
148
		}
149
		
150
		if (!ParametersDialog.open(this)) {
151
			return null;
152
		}
153
		
154
		conlluResultDirectory.mkdirs();
155
		if (conlluResultDirectory == null || !conlluResultDirectory.exists() || !conlluResultDirectory.isDirectory()) {
156
			Log.warning("Error: conllu result directory does not exists: " + conlluResultDirectory);
157
			return null;
158
		}
159
		
160
		CQPCorpus corpus = (CQPCorpus) s;
161
		MainCorpus mainCorpus = corpus.getMainCorpus();
162
		
163
		try {
164
			return exportAnnotationsAsCorpus(mainCorpus, conlluResultDirectory, propertiesPrefix, openingPunct, insertTokenWithoutUdAnnotations,
165
					defaultFormPropertyName, defaultLemmaPropertyName, defaultUposPropertyName, defaultXposPropertyName,
166
					defaultFeatsPropertyName, defaultHeadPropertyName, defaultDeprelPropertyName, defaultDepsPropertyName,
167
					defaultMiscPropertyName,
168
					detectGap, insertParagraphs, insertNoSpaceAfter);
169
		}
170
		catch (Exception e) {
171
			Log.warning(e);
172
			Log.printStackTrace(e);
173
		}
174
		
175
		return null;
176
	}
177
	
178
	/**
179
	 * export the corpus in a directory of conllu files (one per text)
180
	 * 
181
	 * @param mainCorpus
182
	 * @param conlluResultDirectory
183
	 * @param prefix
184
	 * @param openingPunct
185
	 * @param insertTokenWithoutUdAnnotations
186
	 * @param defaultFormPropertyName
187
	 * @param defaultLemmaPropertyName
188
	 * @param defaultUposPropertyName
189
	 * @param defaultXposPropertyName
190
	 * @param detectGap
191
	 * @param insertParagraphs
192
	 * @param insertNoSpaceAfter
193
	 * @return the number of annotation exported
194
	 * @throws UnexpectedAnswerException
195
	 * @throws IOException
196
	 * @throws CqiServerError
197
	 * @throws CqiClientException
198
	 * @throws InvalidCqpIdException
199
	 */
200
	public static int exportAnnotationsAsCorpus(MainCorpus mainCorpus, File conlluResultDirectory, String prefix, String openingPunct, boolean insertTokenWithoutUdAnnotations,
201
			String defaultFormPropertyName, String defaultLemmaPropertyName, String defaultUposPropertyName, String defaultXposPropertyName,
202
			String defaultFeatsPropertyName, String defaultHeadPropertyName, String defaultDeprelPropertyName, String defaultDepsPropertyName,
203
			String defaultMiscPropertyName,
204
			boolean detectGap, boolean insertParagraphs, boolean insertNoSpaceAfter)
205
			throws UnexpectedAnswerException,
206
			IOException,
207
			CqiServerError,
208
			CqiClientException, InvalidCqpIdException {
209
		
210
		if (!conlluResultDirectory.exists()) {
211
			conlluResultDirectory.mkdirs();
212
		}
213
		int numberOfWordsWritten = 0;
214
		int numberOfSentencesWritten = 0;
215
		int numberOfTextsWritten = 0;
216
		
217
		String[] textIds = mainCorpus.getCorpusTextIdsList();
218
		int[] start_limits = mainCorpus.getTextStartLimits();
219
		int[] end_limits = mainCorpus.getTextEndLimits();
220
		
221
		String lang = mainCorpus.getLang();
222
		// HashSet<String> beforeSpacesRules = new HashSet<>(LangFormater.getNoSpaceBefore(mainCorpus.getLang()));
223
		// HashSet<String> afterSpacesRules = new HashSet<>(LangFormater.getNoSpaceAfter(mainCorpus.getLang()));
224
		
225
		for (String p : propNames) {
226
			WordProperty wp = mainCorpus.getProperty(prefix + p);
227
			if (wp == null) {
228
				Log.warning("Error: cannot find the Conllu property: " + prefix + p);
229
				return 0;
230
			}
231
		}
232
		
233
		if (insertTokenWithoutUdAnnotations && (defaultFormPropertyName == null || mainCorpus.getProperty(defaultFormPropertyName) == null)) {
234
			Log.warning("Error: the defaultFormPropertyName parameter needs to be set if insertTokenWithoutUdAnnotations is set to true");
235
			return 0;
236
		}
237
		
238
		for (int iText = 0; iText < start_limits.length; iText++) {
239
			
240
			// Build corpus positions
241
			int[] positions = new int[end_limits[iText] - start_limits[iText] + 1];
242
			int tmp = 0;
243
			for (int n = start_limits[iText]; n <= end_limits[iText]; n++) {
244
				positions[tmp++] = n;
245
			}
246
			numberOfWordsWritten += positions.length;
247
			
248
			// Get UD properties
249
			WordProperty wp;
250
			wp = mainCorpus.getProperty(prefix + "id");
251
			String[] tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
252
			int[] idValues = new int[tmpValues.length];
253
			for (int iId = 0; iId < tmpValues.length; iId++) {
254
				if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) {
255
					idValues[iId] = Integer.parseInt(tmpValues[iId]);
256
				}
257
				else {
258
					idValues[iId] = 0;
259
				}
260
			}
261
			tmpValues = null;
262
			
263
			WordProperty formWordProperty = mainCorpus.getProperty(prefix + "form");
264
			String[] formValues = CQPSearchEngine.getCqiClient().cpos2Str(formWordProperty.getQualifiedName(), positions);
265
			fixUNDEFValues(formValues);
266
			
267
			wp = mainCorpus.getProperty(prefix + "lemma");
268
			String[] lemmaValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
269
			fixUNDEFValues(lemmaValues);
270
			
271
			wp = mainCorpus.getProperty(prefix + "upos");
272
			String[] uposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
273
			fixUNDEFValues(uposValues);
274
			
275
			wp = mainCorpus.getProperty(prefix + "xpos");
276
			String[] xposValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
277
			fixUNDEFValues(xposValues);
278
			
279
			wp = mainCorpus.getProperty(prefix + "feats");
280
			String[] featsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
281
			fixUNDEFValues(featsValues);
282
			
283
			wp = mainCorpus.getProperty(prefix + "head");
284
			// String[] headValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
285
			tmpValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
286
			int[] headValues = new int[tmpValues.length];
287
			for (int iId = 0; iId < tmpValues.length; iId++) {
288
				if (tmpValues[iId].length() > 0 && !tmpValues[iId].equals("_") && !tmpValues[iId].equals("__UNDEF__")) {
289
					headValues[iId] = Integer.parseInt(tmpValues[iId]);
290
				}
291
				else {
292
					headValues[iId] = -1;
293
				}
294
			}
295
			tmpValues = null;
296
			
297
			wp = mainCorpus.getProperty(prefix + "deprel");
298
			String[] deprelValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
299
			fixUNDEFValues(deprelValues);
300
			
301
			wp = mainCorpus.getProperty(prefix + "deps");
302
			String[] depsValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
303
			fixUNDEFValues(depsValues);
304
			
305
			wp = mainCorpus.getProperty(prefix + "misc");
306
			String[] miscValues = CQPSearchEngine.getCqiClient().cpos2Str(wp.getQualifiedName(), positions);
307
			fixUNDEFValues(miscValues);
308
			
309
			HashSet<Integer> paragraphsStartPositions = new HashSet<>();
310
			if (insertParagraphs) {
311
				StructuralUnit p_struct = mainCorpus.getStructuralUnit("p");
312
				if (p_struct == null) {
313
					Log.warning(NLS.bind("** insertParagraph parameter is set, but there are no *p* structure (no paragraph) in the {0} CQP corpus. The insertParagraph parameter will be ignored.",
314
							mainCorpus));
315
				}
316
				else {
317
					for (int position : mainCorpus.query(new CQLQuery("<p> [_.text_id=\"" + textIds[iText] + "\"]"), "textParagraphPositions", false).getStarts()) {
318
						paragraphsStartPositions.add(position);
319
					}
320
				}
321
			}
322
			
323
			HashMap<Integer, String> sentidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"sentid");
324
			HashMap<Integer, String> newdocidStartPositions = getNonUNDEFPositionsAndValues(mainCorpus, prefix+"newdocid");
325
			
326
			// build sentence, first pass using UD word sentence positions
327
			ArrayList<ArrayList<Integer>> sentences = new ArrayList<>();
328
			ArrayList<Integer> tmpSentence = new ArrayList<>();
329
			for (int p = 0; p < positions.length; p++) {
330
				// System.out.println("p=" + p + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + " feats="
331
				// + featsValues[p] + " head="
332
				// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
333
				if (sentidStartPositions.containsKey(p)) { // new ud sentence
334
					
335
					if (tmpSentence.size() > 0) {
336
						sentences.add(new ArrayList<>(tmpSentence));
337
					}
338
					
339
					// System.out.println("new sentence: " + " id=" + idValues[p] + " form=" + formValues[p] + " lemma=" + lemmaValues[p] + " upos=" + uposValues[p] + " xpos=" + xposValues[p] + "
340
					// feats="
341
					// + featsValues[p] + " head="
342
					// + headValues[p] + " deprel=" + deprelValues[p] + " deps=" + depsValues[p] + " misc=" + miscValues[p]);
343
					tmpSentence.clear();
344
				}
345
				
346
				if (insertTokenWithoutUdAnnotations) {
347
					tmpSentence.add(p); // insert all tokens
348
				}
349
				else if (idValues[p] != 0) {
350
					tmpSentence.add(p); // insert all tokens
351
				}
352
				
353
			}
354
			positions = null; // free memory
355
			
356
			// fixing sentences
357
			for (int s = 0; s < sentences.size(); s++) {
358
				
359
				// fix only ud sentences limits
360
				ArrayList<Integer> sentence = sentences.get(s);
361
				
362
				if (sentidStartPositions.get(sentence.get(0)) == null) {
363
					continue; // this is not a UD sentence
364
				}
365
				
366
				int max = -1;
367
				int imax = 0;
368
				for (int ip = 0; ip < sentence.size(); ip++) {
369
					int p = sentence.get(ip);
370
					if (max < idValues[p]) {
371
						max = idValues[p];
372
						imax = ip;
373
					}
374
				}
375
				
376
				ArrayList<Integer> newSentence = new ArrayList<>();
377
				for (int ip = imax + 1; ip < sentence.size(); ip++) {
378
					newSentence.add(sentence.get(ip));
379
					sentence.remove(ip);
380
					ip--;
381
				}
382
				if (newSentence.size() == 1) { // the new sentence size is 1, resinsert it
383
					sentence.addAll(newSentence);
384
					newSentence.clear();
385
				}
386
				if (newSentence.size() > 0) {
387
					//System.out.println("INSERT " + newSentence);
388
					sentences.add(s + 1, newSentence);
389
				}
390
			}
391
			
392
			if (tmpSentence.size() > 0) { // add last sentence
393
				sentences.add(new ArrayList<>(tmpSentence));
394
			}
395
			
396
			// fixing sentence __NULL__ ud properties
397
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
398
				ArrayList<Integer> sentence = sentences.get(iSentence);
399
				
400
				int[] sentencePositions = new int[sentence.size()];
401
				for (int p = 0; p < sentence.size(); p++) {
402
					sentencePositions[p] = sentence.get(p);
403
				}
404
				
405
				// get CQP values fixing "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
406
				String[] ids = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("id").getQualifiedName(), sentencePositions);
407
				
408
				String[] words = null;
409
				if (defaultFormPropertyName != null && defaultFormPropertyName.length() > 0) {
410
					words = getDefaultValues(mainCorpus, defaultFormPropertyName, sentencePositions);
411
				}
412
				String[] lemmas = null;
413
				if (defaultLemmaPropertyName != null && defaultLemmaPropertyName.length() > 0) {
414
					lemmas = getDefaultValues(mainCorpus, defaultLemmaPropertyName, sentencePositions);
415
				}
416
				String[] uposs = null;
417
				if (defaultUposPropertyName != null && defaultUposPropertyName.length() > 0) {
418
					uposs = getDefaultValues(mainCorpus, defaultUposPropertyName, sentencePositions);
419
				}
420
				String[] xposs = null;
421
				if (defaultXposPropertyName != null && defaultXposPropertyName.length() > 0) {
422
					xposs = getDefaultValues(mainCorpus, defaultXposPropertyName, sentencePositions);
423
				}
424
				
425
				String[] feats = null;
426
				if (defaultFeatsPropertyName != null && defaultFeatsPropertyName.length() > 0) {
427
					feats = getDefaultValues(mainCorpus, defaultFeatsPropertyName, sentencePositions);
428
				}
429
				String[] heads = null;
430
				if (defaultHeadPropertyName != null && defaultHeadPropertyName.length() > 0) {
431
					heads = getDefaultValues(mainCorpus, defaultHeadPropertyName, sentencePositions);
432
				}
433
				String[] deprels = null;
434
				if (defaultDeprelPropertyName != null && defaultDeprelPropertyName.length() > 0) {
435
					deprels = getDefaultValues(mainCorpus, defaultDeprelPropertyName, sentencePositions);
436
				}
437
				String[] depss = null;
438
				if (defaultDepsPropertyName != null && defaultDepsPropertyName.length() > 0) {
439
					depss = getDefaultValues(mainCorpus, defaultDepsPropertyName, sentencePositions);
440
				}
441
				String[] miscs = null;
442
				if (defaultMiscPropertyName != null && defaultMiscPropertyName.length() > 0) {
443
					miscs = getDefaultValues(mainCorpus, defaultMiscPropertyName, sentencePositions);
444
				}
445
				
446
				// String[] feats = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(featsCorrPropertyName).getQualifiedName(), sentencePositions);
447
				// String[] head = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(headCorrPropertyName).getQualifiedName(), sentencePositions);
448
				// String[] deprel = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
449
				// String[] deps = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(deprelCorrPropertyName).getQualifiedName(), sentencePositions);
450
				
451
				// fix ud properties using CQP values
452
				for (int ip = 0; ip < sentence.size(); ip++) {
453
					
454
					int p = sentence.get(ip);
455
					
456
					// new word
457
					if (miscValues[p].equals("_")) {
458
						miscValues[p] = "XmlId=" + ids[ip];
459
					}
460
					
461
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
462
					if (words != null && formValues[p].equals("_")) {
463
						formValues[p] = words[ip];
464
					}
465
					if (lemmas != null && lemmaValues[p].equals("_")) {
466
						lemmaValues[p] = lemmas[ip];
467
					}
468
					if (uposs != null && uposValues[p].equals("_")) {
469
						uposValues[p] = uposs[ip];
470
					}
471
					if (xposs != null && xposValues[p].equals("_")) {
472
						xposValues[p] = xposs[ip];
473
					}
474
					if (feats != null && featsValues[p].equals("_")) {
475
						featsValues[p] = feats[ip];
476
					}
477
					if (heads != null && headValues[p] < 0) {
478
						headValues[p] = Integer.parseInt(heads[ip]);
479
					}
480
					if (deprels != null && deprelValues[p].equals("_")) {
481
						deprelValues[p] = deprels[ip];
482
					}
483
					if (depss != null && depsValues[p].equals("_")) {
484
						depsValues[p] = depss[ip];
485
					}
486
					if (miscs != null && miscValues[p].equals("_")) {
487
						miscValues[p] = miscs[ip];
488
					}
489
				}
490
				
491
				if (insertNoSpaceAfter) {
492
					for (int ip = 0; ip < sentence.size(); ip++) { // fix SpaceAfter. !!! this needs to be done after ud properties are fixed
493
						int p = sentence.get(ip);
494
						// insertion activated
495
						if (!miscValues[p].contains("SpaceAfter=")) { // only update if not present
496
							if (LangFormater.isSpaceAfterNotNeeded(formValues[p], lang)) {
497
								miscValues[p] += "|SpaceAfter=No";
498
							}
499
							else if (formValues.length > (p + 1) && LangFormater.isSpaceBeforeNotNeeded(formValues[p + 1], lang)) {
500
								// if next token needs a space before, set SpaceAfter=Yes to the previous token
501
								miscValues[p] += "|SpaceAfter=No";
502
							}
503
						}
504
					}
505
				}
506
				
507
				// fixing sentence punct limits
508
				while (sentence.size() > 0 && iSentence > 0 && formValues[sentence.get(0)].matches("\\p{P}") && !formValues[sentence.get(0)].matches(openingPunct)) {
509
					// System.out.println("FIXING: first non-openingPunct position " + formValues[sentence.get(0)] + " in " + iSentence);
510
					int p2 = sentence.remove(0);
511
					sentences.get(iSentence - 1).add(p2);
512
				}
513
				//
514
				while (sentence.size() > 0 && iSentence + 1 < sentences.size() && formValues[sentence.get(sentence.size() - 1)].matches(openingPunct)) {
515
					// System.out.println("FIXING: last openingPunct position " + formValues[sentence.get(sentence.size() - 1)] + " in " + iSentence);
516
					int p2 = sentence.remove(sentence.size() - 1);
517
					sentences.get(iSentence + 1).add(0, p2);
518
				}
519
				
520
				if (sentence.size() == 0) { // sentence was depleted after fixing it
521
					sentences.remove(iSentence);
522
					iSentence--;
523
					continue;
524
				}
525
			}
526
			
527
			for (int s = 0; s < sentences.size(); s++) {
528
				
529
				// fix only ud sentences limits
530
				ArrayList<Integer> sentence = sentences.get(s);
531
				HashMap<Integer, Integer> oldToNewIds = new HashMap<>();
532
				for (int ip = 0; ip < sentence.size(); ip++) { // computing old to new ids
533
					int p = sentence.get(ip);
534
					
535
					if (idValues[p] != 0) { // store "old id -> new id"
536
						oldToNewIds.put(idValues[p], (ip + 1)); // from 1 to N
537
					}
538
				}
539
				
540
				// fixing head and set missing head to 0 and root
541
				for (int ip = 0; ip < sentence.size(); ip++) {
542
					int p = sentence.get(ip);
543
					
544
					// fixing id value
545
					idValues[p] = (ip + 1);  // from 1 to N
546
					
547
					// fixing head values
548
					if (oldToNewIds.containsKey(headValues[p])) {
549
						headValues[p] = oldToNewIds.get(headValues[p]);
550
					}
551
					else if (headValues[p] != 0) { // new word, set to default values
552
						headValues[p] = 0;
553
						deprelValues[p] = "_";
554
						depsValues[p] = "_";
555
					}
556
				}
557
			}
558
			
559
			// writing sentences
560
			File resultConlluFile = new File(conlluResultDirectory, textIds[iText] + ".conllu");
561
			PrintWriter writer = IOUtils.getWriter(resultConlluFile);
562
			
563
			int iParagraph = 1;
564
			
565
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
566
				ArrayList<Integer> sentence = sentences.get(iSentence);
567
				
568
				int[] sentencePositions = new int[sentence.size()];
569
				for (int p = 0; p < sentence.size(); p++) {
570
					sentencePositions[p] = sentence.get(p);
571
				}
572
				
573
				String[] gap = null;
574
				if (detectGap && mainCorpus.getProperty("gap") != null) {
575
					gap = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty("gap").getQualifiedName(), sentencePositions);
576
				}
577
				
578
				String[] tokens = new String[sentence.size()];
579
				for (int ip = 0; ip < sentence.size(); ip++) {
580
					tokens[ip] = formValues[sentence.get(ip)];
581
				}
582
				
583
				if (insertNoSpaceAfter) {
584
					writer.println("# text = " + LangFormater.format(StringUtils.join(tokens, " "), mainCorpus.getLang()));
585
				}
586
				else {
587
					writer.println("# text = " + StringUtils.join(tokens, " "));
588
				}
589
				
590
				if (newdocidStartPositions.containsKey(sentence.get(0))) {
591
					writer.println("# newdoc id = " + newdocidStartPositions.get(sentence.get(0)));
592
				}
593
				else {
594
					writer.println("# newdoc id = " + textIds[iText]);
595
				}
596
				
597
				boolean foundSentId = false;
598
				for (int ip : sentence) {
599
					if (!foundSentId && sentidStartPositions.containsKey(ip)) {
600
						writer.println("# sent_id = " + sentidStartPositions.get(ip));
601
						foundSentId = true;
602
					}
603
				}
604
				if (!foundSentId) { // no sent_id found
605
					writer.println("# sent_id = " + textIds[iText] + "-" + (iSentence + 1) + ".new");
606
				}
607
				
608
				if (paragraphsStartPositions.contains(sentence.get(0))) { // paragraphsStartPositions is empty if the injectParagraph option is not set
609
					writer.println("# newpar id = " + iParagraph);
610
					iParagraph++;
611
				}
612
				
613
				for (int ip = 0; ip < sentence.size(); ip++) {
614
					int p = sentence.get(ip);
615
					
616
					// { "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc" };
617
					writer.println(idValues[p] + "\t" + formValues[p] + "\t" + lemmaValues[p] + "\t" + uposValues[p]
618
							+ "\t" + xposValues[p] + "\t" + featsValues[p] + "\t" + headValues[p] + "\t" + deprelValues[p]
619
							+ "\t" + depsValues[p] + "\t" + miscValues[p]);
620
					
621
					if (gap != null && gap[ip].equals("next")) {
622
						writer.println("# gap");
623
					}
624
				}
625
				writer.println("");
626
				numberOfSentencesWritten++;
627
			}
628
			writer.close();
629
			
630
			System.out.println(" Text done: " + resultConlluFile);
631
			numberOfTextsWritten++;
632
		}
633
		
634
		System.out.println("# words written: " + numberOfWordsWritten);
635
		System.out.println("# sentences written: " + numberOfSentencesWritten);
636
		System.out.println("# texts written: " + numberOfTextsWritten);
637
		
638
		return numberOfWordsWritten;
639
	}
640
	
641
	private static String[] getDefaultValues(MainCorpus mainCorpus, String property, int[] positions) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
642
		
643
		String[] values = CQPSearchEngine.getCqiClient().cpos2Str(mainCorpus.getProperty(property).getQualifiedName(), positions);
644
		for (int iupos = 0; iupos < values.length; iupos++) { // recode the || CQP multiple values to ud multiple values
645
			if (values[iupos].length() > 2 && values[iupos].startsWith("|") && values[iupos].endsWith("|")) {
646
				values[iupos] = values[iupos].substring(1, values[iupos].length() - 1);
647
			}
648
		}
649
		
650
		return values;
651
	}
652
	
653
	private static HashMap<Integer, String> getNonUNDEFPositionsAndValues(MainCorpus mainCorpus, String property) throws UnexpectedAnswerException, IOException, CqiServerError, CqiClientException {
654
		
655
		HashMap<Integer, String> sentidStartPositions = new HashMap<>();
656
		int[] ids = CQPSearchEngine.getCqiClient().regex2Id(mainCorpus.getProperty(property).getQualifiedName(), "(?!__UNDEF__).+");
657
		String[] strs = CQPSearchEngine.getCqiClient().id2Str(mainCorpus.getProperty(property).getQualifiedName(), ids);
658
		for (int iId = 0; iId < ids.length; iId++) {
659
			int id = ids[iId];
660
			int[] pp = CQPSearchEngine.getCqiClient().id2Cpos(mainCorpus.getProperty(property).getQualifiedName(), id);
661
			for (int p : pp) {
662
				sentidStartPositions.put(p, strs[iId]);
663
			}
664
		}
665
		
666
		return sentidStartPositions;
667
	}
668
	
669
	private static void fixUNDEFValues(String[] values) {
670
		
671
		for (int i = 0; i < values.length; i++) {
672
			if (values[i].equals("__UNDEF__") || values[i].equals("") || values[i].equals("|_|")) {
673
				values[i] = "_";
674
			}
675
		}
676
	}
677
}
678

  
679

  
0 680

  
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ToCoNLL2009.java (revision 3540)
1
package org.txm.conllu.rcp.commands;
2

  
3
import java.io.File;
4
import java.io.FileOutputStream;
5
import java.io.IOException;
6
import java.io.OutputStreamWriter;
7
import java.io.Writer;
8
import java.util.List;
9

  
10
import org.txm.searchengine.cqp.clientExceptions.CqiClientException;
11
import org.txm.searchengine.cqp.corpus.CQPCorpus;
12
import org.txm.searchengine.cqp.corpus.CorpusManager;
13
import org.txm.searchengine.cqp.corpus.Property;
14
import org.txm.searchengine.cqp.corpus.StructuralUnit;
15
import org.txm.searchengine.cqp.corpus.StructuralUnitProperty;
16
import org.txm.searchengine.cqp.corpus.WordProperty;
17
import org.txm.searchengine.cqp.corpus.query.CQLQuery;
18
import org.txm.searchengine.cqp.corpus.query.Match;
19
import org.txm.searchengine.cqp.serverException.CqiServerError;
20
import org.txm.utils.ConsoleProgressBar;
21

  
22
public class ToCoNLL2009 {
23
	
24
	boolean debug = false;
25
	/**
26
	 * To CoNLL2009.
27
	 *
28
	 * @param outfile the outfile, where is result is saved
29
	 * @param corpus the corpus or sub-corpus to export
30
	 * @param sentenceUnit the StructuralUnitProperty that identify a sentence
31
	 * @param word the word property
32
	 * @param lemme the lemme property
33
	 * @param pos the pos property
34
	 * @param encoding the encoding of the outfile
35
	 * @return true, if successful
36
	 * @throws CqiServerError 
37
	 * @throws IOException 
38
	 * @throws CqiClientException 
39
	 */
40
	public boolean process(File outdir, CQPCorpus corpus, StructuralUnit sentenceUnit, Property word, Property lemme, Property pos, String encoding) throws CqiClientException, IOException, CqiServerError {
41
		
42
		String sstruct = "";
43
		if (sentenceUnit != null) {
44
			sstruct = "expand to "+sentenceUnit.getName();
45
		}
46
		
47
		String[] texts = corpus.getCorpusTextIdsList();
48
		WordProperty idProperty = corpus.getProperty("id");
49
		
50
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.length);
51
		
52
		for (String text : texts) {
53
			
54
			cpb.tick();
55
			
56
			List<Match> matches = corpus.query(new CQLQuery("[_.text_id=\""+text+"\"]"+sstruct), "TMPEXPORTCONLL", false).getMatches();
57
			if (debug) System.out.println(matches.size());
58
			int npositions = 0;
59
			for (Match match : matches) npositions += match.size() + 1;
60
			if (debug) System.out.println("npositions="+npositions);
61
			
62
			int[] positions = new int[npositions+1];
63
			int i = 0;
64
			for (Match match : matches) {
65
				for (int p : match.getRange())
66
					positions[i++] = p;
67
			}
68
			
69
			String[] words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions);
70
			
71
			String[] idsList = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(idProperty.getQualifiedName(), positions);
72
			
73
			String[] lemmes = null;
74
			if (lemme != null) {
75
				lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions);
76
			}
77
			
78
			String[] poss = null;
79
			if (pos != null) {
80
				poss =  CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions);
81
			}
82
			
83
			File outfile = new File(outdir, text+".conllu");
84
			Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$;
85
			
86
			String lemmestr;
87
			String posstr;
88
			int id = 0;
89
			int iposition = 0;
90
			for (Match match : matches) {
91
				
92
				writer.write("\n");
93
				writer.write("# newdoc id="+text+"\n");
94
				writer.write("# sent_id="+match+"\n");
95
				
96
				id = 1; // first position in sentence
97
				for (i = match.getStart() ; i <= match.getEnd() ; i++) { // loop n times, n = mathc length
98
					
99
					if (lemme != null) {
100
						lemmestr = lemmes[iposition];
101
					} else {
102
						lemmestr = "_";
103
					}
104
					
105
					if (pos != null) {
106
						posstr = poss[iposition];
107
					} else {
108
						posstr = "_";
109
					}
110
					
111
					//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
112
					writer.write(""+id+"\t"+words[iposition]+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\tXmlID="+idsList[iposition]+"\n");
113
					
114
					id++; // next word
115
					iposition++; // nextposition
116
				}
117
			}
118
			writer.flush();
119
			writer.close();
120
		}
121
		
122
		cpb.done();
123
		return true;
124
	}
125
	
126
	public static void main(String[] args) throws CqiClientException, IOException, CqiServerError {
127
		
128
		CQPCorpus corpus = null;
129
		File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv");
130
		String encoding = "UTF-8";
131
		
132
		String sentenceProperty = "s_id";
133
		String posProperty = "frpos";
134
		String lemmaProperty = "frlemma";
135
		
136
		String[] split = sentenceProperty.split("_", 2);
137
		StructuralUnit s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
138
		Property word = null;//corpus.getProperty("word")
139
		Property lemma = null;//corpus.getProperty(lemmaProperty)
140
		Property pos = null;//corpus.getProperty(posProperty)
141
		
142
		//		if (s == null) { println "Error sentence property: $sentenceProperty"; return}
143
		//		if (word == null) { println "Error no word property"; return}
144
		//		if (pos == null) { println "Error pos property: $posProperty"; return}
145
		//		if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}
146
		
147
		ToCoNLL2009 processor = new ToCoNLL2009();
148
		processor.process(outfile, corpus, s, word, lemma, pos, encoding);
149
	}
150
}
0 151

  
TXM/trunk/org.txm.conllu.rcp/plugin.xml (revision 3540)
21 21
            id="org.txm.conllu.rcp.commands.ImportCoNLLUAnnotationsFromFile"
22 22
            name="%command.name.1">
23 23
      </command>
24
      <command
25
            categoryId="TreeSearch4TXM.commands.category"
26
            defaultHandler="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU"
27
            id="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU"
28
            name="Export as a full CoNLL-U corpus...">
29
      </command>
24 30
   </extension>
25 31
   <extension
26 32
         point="org.eclipse.ui.menus">
......
42 48
         <command
43 49
               commandId="org.txm.conllu.rcp.commands.ExportCorpusAsCoNLLU"
44 50
               icon="icons/functions/UD.png"
45
               label="%command.label.0"
46 51
               style="push">
47 52
            <visibleWhen
48 53
                  checkEnabled="false">
......
51 56
               </reference>
52 57
            </visibleWhen>
53 58
         </command>
59
         <command
60
               commandId="org.txm.conllu.rcp.commands.ExportCorpusAsFullCoNLLU"
61
               icon="icons/functions/UD.png"
62
               style="push">
63
            <visibleWhen
64
                  checkEnabled="false">
65
               <reference
66
                     definitionId="OneMainCorpusSelected">
67
               </reference>
68
            </visibleWhen>
69
         </command>
54 70
      </menuContribution>
55 71
      <menuContribution
56 72
            locationURI="menu:menu.file.import.annotations">
TXM/trunk/org.txm.core/src/java/org/txm/core/preferences/TBXPreferences.java (revision 3540)
75 75
	
76 76
	public static final String EDITION_DEFINITION_IMAGES_DIRECTORY = "edition_definition_images_directory"; //$NON-NLS-1$
77 77
	
78
	public static final String EDITION_DEFINITION_TOOLTIP_PROPERTIES = "edition_definition_tooltip_properties"; //$NON-NLS-1$
79
	
78 80
	/**
79 81
	 * To show or not all nodes in some views, eg. in CorporaView tree.
80 82
	 */
TXM/trunk/org.txm.core/src/java/org/txm/objects/EditionDefinition.java (revision 3540)
62 62
		node.put(key, value);
63 63
	}
64 64
	
65
	public String get(String key) {
66
		return node.get(key, null);
65
	public String get(String key, String defaultValue) {
66
		return node.get(key, defaultValue);
67 67
	}
68 68
	
69
	public final String get(String key) {
70
		return get(key, "");
71
	}
72
	
69 73
	public void copyParametersTo(EditionDefinition anotherEditionDefinition) {
70 74
		anotherEditionDefinition.setPageBreakTag(this.getPageElement());
71 75
		anotherEditionDefinition.setWordsPerPage(this.getWordsPerPage());

Formats disponibles : Unified diff