Révision 3539

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/export/conll2009/ToCoNLL2009.groovy (revision 3539)
10 10
import org.txm.searchengine.cqp.CqpDataProxy;
11 11
import org.txm.searchengine.cqp.corpus.*
12 12
import org.txm.searchengine.cqp.corpus.query.CQLQuery
13
import org.txm.utils.ConsoleProgressBar
13 14

  
14 15
public class ToCoNLL2009 {
15

  
16
	
16 17
	boolean debug = true
17 18
	/**
18 19
	 * To CoNLL2009.
......
26 27
	 * @param encoding the encoding of the outfile
27 28
	 * @return true, if successful
28 29
	 */
29
	public boolean process(File outfile, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding) {
30
	public boolean process(File outdir, CQPCorpus corpus, StructuralUnitProperty sentenceUnit, Property word, Property lemme, Property pos, String encoding) {
30 31
		
31
		def matches = corpus.query(new CQLQuery("<s> []+ </s>"), "TMPEXPORTCONLL", false).getMatches()
32
		if (debug) println matches.size()
33
		int npositions = 0;
34
		for (def match : matches) npositions += match.size() +1
35
		if (debug) println "npositions= $npositions"
36

  
37
		int[] positions = new int[npositions+1]
38
		int i = 0;
39
		for (def match : matches) {
40
			for (int p : match.getRange())
41
				positions[i++] = p
42
		}
43

  
44
		List<String> words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions);
45

  
46
		List<String> lemmes = null;
47
		if (lemme != null) {
48
			lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions);
49
		}
50

  
51
		List<String> poss = null;
52
		if (pos != null) {
53
			poss =  CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions);
54
		}
55

  
56
		Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$;
57

  
58
		String lemmestr;
59
		String posstr;
60
		int id = 0;
61
		int iposition = 0;
62
		for (def match : matches) {
63
			id = 0 // first position in sentence
64
			for (i = match.start ; i < match.end ; i++) { // loop n times, n = match length
65

  
66
				if (lemme != null)
67
					lemmestr = lemmes.get(iposition);
68
				else
69
					lemmestr = "_";
70

  
71
				if (pos != null)
72
					posstr = poss.get(iposition);
73
				else
74
					posstr = "_";
75

  
76
				//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
77
				writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
78

  
79
				id++; // next word
80
				iposition++ // nextposition
32
		String sstruct = sentenceUnit.getStructuralUnit().getName()
33
		
34
		def texts = corpus.getCorpusTextIdsList();
35
		def idProperty = corpus.getProperty("id")
36
		
37
		ConsoleProgressBar cpb = new ConsoleProgressBar(texts.size());
38
		
39
		for (def text : texts) {
40
			
41
			cpb.tick()
42
			
43
			def matches = corpus.query(new CQLQuery("[_.text_id=\"$text\"] expand to $sstruct"), "TMPEXPORTCONLL", false).getMatches()
44
			if (debug) println matches.size()
45
			int npositions = 0;
46
			for (def match : matches) npositions += match.size() +1
47
			if (debug) println "npositions= $npositions"
48
			
49
			int[] positions = new int[npositions+1]
50
			int i = 0;
51
			for (def match : matches) {
52
				for (int p : match.getRange())
53
					positions[i++] = p
81 54
			}
55
			
56
			List<String> words = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(word.getQualifiedName(), positions);
57
			
58
			List<String> idsList = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(idProperty.getQualifiedName(), positions);
59
			
60
			List<String> lemmes = null;
61
			if (lemme != null) {
62
				lemmes = CorpusManager.getCorpusManager().getCqiClient().cpos2Str(lemme.getQualifiedName(), positions);
63
			}
64
			
65
			List<String> poss = null;
66
			if (pos != null) {
67
				poss =  CorpusManager.getCorpusManager().getCqiClient().cpos2Str(pos.getQualifiedName(), positions);
68
			}
69
			
70
			File outfile = new File(outdir, text+".conllu")
71
			Writer writer = new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"); //$NON-NLS-1$;
72
			
73
			String lemmestr;
74
			String posstr;
75
			int id = 0;
76
			int iposition = 0;
77
			for (def match : matches) {
78
				
79
				writer.write("\n");
80
				writer.write("# newdoc id=$text\n");
81
				writer.write("# sent_id=$match\n");
82
				
83
				id = 0 // first position in sentence
84
				for (i = match.start ; i < match.end ; i++) { // loop n times, n = match length
85
					
86
					if (lemme != null)
87
						lemmestr = lemmes.get(iposition);
88
					else
89
						lemmestr = "_";
90
					
91
					if (pos != null)
92
						posstr = poss.get(iposition);
93
					else
94
						posstr = "_";
95
					
96
					//print(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\n");
97
					writer.write(""+id+"\t"+words.get(iposition)+"\t"+lemmestr+"\t_\t"+posstr+"\t_\t_\t_\t_\t_\t_\t_\t_\t_\tXmlID="+idsList.get(iposition)+"\n");
98
					
99
					id++; // next word
100
					iposition++ // nextposition
101
				}
102
			}
103
			writer.flush();
104
			writer.close();
82 105
		}
83
		writer.flush();
84
		writer.close();
85

  
106
		
107
		cpb.done()
86 108
		return true;
87 109
	}
88

  
110
	
89 111
	public static void main(String[] args) {
90 112
		CQPCorpus corpus = null
91 113
		File outfile = new File("/home/mdecorde/Bureau/conn_export.tsv")
92 114
		String encoding = "UTF-8"
93

  
115
		
94 116
		String sentenceProperty = "s_id"
95 117
		String posProperty = "frpos"
96 118
		String lemmaProperty = "frlemma"
97

  
98

  
119
		
120
		
99 121
		def split = sentenceProperty.split("_", 2)
100 122
		StructuralUnitProperty s = null;//corpus.getStructuralUnit(split[0]).getProperty(split[1])
101 123
		Property word = null;//corpus.getProperty("word")
102 124
		Property lemma = null;//corpus.getProperty(lemmaProperty)
103 125
		Property pos = null;//corpus.getProperty(posProperty)
104

  
126
		
105 127
		//		if (s == null) { println "Error sentence property: $sentenceProperty"; return}
106 128
		//		if (word == null) { println "Error no word property"; return}
107 129
		//		if (pos == null) { println "Error pos property: $posProperty"; return}
108 130
		//		if (lemma == null) { println "Error lemma property: $lemmaProperty"; return}
109

  
110

  
131
		
132
		
111 133
		ToCoNLL2009 processor = new ToCoNLL2009()
112 134
		processor.process(outfile, corpus, s, word, lemma, pos, encoding)
113 135
	}
TXM/trunk/org.txm.conllu.rcp/src/org/txm/conllu/rcp/commands/ImportCoNLLUAnnotationsFromDirectory.java (revision 3539)
52 52
import org.txm.searchengine.cqp.corpus.CQPCorpus;
53 53
import org.txm.searchengine.cqp.corpus.MainCorpus;
54 54
import org.txm.searchengine.cqp.serverException.CqiServerError;
55
import org.txm.utils.ConsoleProgressBar;
55 56
import org.txm.utils.logger.Log;
56 57

  
57 58
/**
......
153 154
	public static int importAnnotations(MainCorpus mainCorpus, File conlluDirectory, String propertiesPrefix, Boolean normalizeWordIds, 
154 155
			Set<String> headPropertiesToProject, Set<String> depsPropertiesToProject, Set<String> udPropertiesToImport) throws IOException, CqiServerError, CqiClientException, XMLStreamException {
155 156
		
156
		Log.info(TXMCoreMessages.bind("Importing CONLL-u annotations of {0} in {1} using the ''{2}'' prefix...", conlluDirectory, mainCorpus, propertiesPrefix));
157
		Log.info(TXMCoreMessages.bind("Importing CONLL-U annotations of {0} in {1} using the ''{2}'' prefix...", conlluDirectory, mainCorpus, propertiesPrefix));
157 158
		
158 159
		File[] files = conlluDirectory.listFiles(new FileFilter() {
159 160
			@Override
......
164 165
		
165 166
		int nTextProcessed = 0;
166 167
		int nWordsInserted = 0;
168
		ConsoleProgressBar cpb = new ConsoleProgressBar(files.length);
167 169
		for (File coonluFile : files) {
168
			
170
			cpb.tick();
169 171
			nWordsInserted += ImportCoNLLUAnnotations._importAnnotations(coonluFile, mainCorpus, propertiesPrefix, null, normalizeWordIds, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
170 172
			nTextProcessed++;
171 173
		}
174
		cpb.done();
172 175
		
173 176
		if (nTextProcessed == 0) {
174 177
			Log.warning("** No text to process. Aborting.");
TXM/trunk/org.txm.connlu.core/src/org/txm/conllu/core/function/ImportCoNLLUAnnotations.java (revision 3539)
61 61
			
62 62
			if (line.startsWith("#")) {
63 63
				if (line.startsWith("# sent_id = ")) {
64
					
65
					// write previous sentence word properties
66
					if (sentenceProperties.size() > 0) {
67
						storeSentenceProperties(sentenceProperties, processor, propertiesPrefix, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
68
					}
69
					
70 64
					sent_id = line.substring(12).trim();
71
					
72 65
				}
73 66
				else if (line.startsWith("# newdoc id = ")) {
74 67
					newdoc_id = line.substring(14).trim();
......
91 84
				continue; // comment
92 85
			}
93 86
			
87
			if (split[0].equals("1")) { // a new sentence begins, write the previous if any
88
				// write previous sentence word properties
89
				if (sentenceProperties.size() > 0) {
90
					storeSentenceProperties(sentenceProperties, processor, propertiesPrefix, headPropertiesToProject, depsPropertiesToProject, udPropertiesToImport);
91
				}
92
			}
93
			
94 94
			String misc = split[9];
95 95
			String[] miscValues = misc.split("\\|");
96 96
			String wId = null;

Formats disponibles : Unified diff