Révision 3264

tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/StringTokenizer.java (revision 3264)
4 4

  
5 5
public interface StringTokenizer {
6 6
	
7
	List<String> processText(String text);
7
	List<List<String>> processText(String text);
8
	
9
	boolean doSentences();
8 10
}
tmp/org.txm.tokenizer.core/src/org/txm/tokenizer/SimpleStringTokenizer.groovy (revision 3264)
77 77
		this(new TokenizerClasses(lang))
78 78
	}
79 79
	
80
	public boolean doSentences() {
81
		return false;
82
	}
83
	
80 84
	/**
81 85
	 * Instantiates a new simple string tokenizer.
82 86
	 *
......
133 137
	/**
134 138
	 * Process word.
135 139
	 */
136
	public ArrayList<String> processText(String text) {
137
		ArrayList<String> result = new ArrayList<String>();
140
	public ArrayList<ArrayList<String>> processText(String text) {
141
		ArrayList<ArrayList<String>> result = new ArrayList<String>()
142
		ArrayList<String> sresult = new ArrayList<String>()
138 143
		if (regSplitWhiteSpaces != null) {
139 144
			for (String s : regSplitWhiteSpaces.split(text)) {		// separate with unicode white spaces
140 145
				// if (DEBUG){println "process $s"}
141
				result.addAll(iterate(s));
146
				sresult.addAll(iterate(s));
142 147
			}
143 148
		}
144 149
		else {
145
			result.addAll(iterate(text));
150
			sresult.addAll(iterate(text));
146 151
		}
152
		result.add(sresult)
147 153
		return result;
148 154
	}
149 155
	
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/XMLTXMToUDPipeXMLParser.java (revision 3264)
72 72
	private Sentences sentences;
73 73
	
74 74
	private Sentence sentence;
75

  
76
	private int nSentenceTagFound = 0;
75 77
	
76 78
	/**
77 79
	 * Instantiates a new builds the tt src.
......
112 114
		String form = ""; // the content of the form tag
113 115
		String lastopenlocalname = "";
114 116
		String localname = "";
117
		boolean inS = false;
115 118
		try {
116 119
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
117 120
				switch (event) {
121
					case XMLStreamConstants.PROCESSING_INSTRUCTION:
122
						//System.out.println("PI target="+parser.getPITarget()+" data="+parser.getPIData());
123
						if (!inS && "txm".equals(parser.getPITarget()) && "</s>".equals(parser.getPIData())) {
124
							if (sentence != null && sentence.getWords().size() > 0) {
125
								sentences.add(sentence);
126
							}
127
							sentence = new Sentence();
128
							nSentenceTagFound ++;
129
						}
130
						break;
118 131
					case XMLStreamConstants.START_ELEMENT:
119 132
						localname = parser.getLocalName();
120 133
						
......
144 157
									sentences.add(sentence);
145 158
								}
146 159
								sentence = new Sentence();
160
								nSentenceTagFound ++;
161
								inS = true;
147 162
								break;
148 163
						}
149 164
						break;
......
166 181
								break;
167 182
							
168 183
							case "s":
184
								inS = false;
169 185
								break;
170 186
						}
171 187
						break;
......
194 210
		return true;
195 211
	}
196 212
	
213
	public int getNSentenceTagFound() {
214
		return nSentenceTagFound;
215
	}
216
	
197 217
	/**
198 218
	 * The main method.
199 219
	 *
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeJavaUtils.java (revision 3264)
13 13
import cz.cuni.mff.ufal.udpipe.OutputFormat;
14 14
import cz.cuni.mff.ufal.udpipe.Sentence;
15 15
import cz.cuni.mff.ufal.udpipe.Sentences;
16
import cz.cuni.mff.ufal.udpipe.Version;
16 17
import cz.cuni.mff.ufal.udpipe.Word;
17 18
import cz.cuni.mff.ufal.udpipe.Words;
18 19

  
......
38 39
		
39 40
		for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
40 41
			Sentence sent = sentences.get(iSentence);
42
			
43
			model.parse(sent, "");
41 44
			model.tag(sent, "");
42
			model.parse(sent, "");
45
//			System.out.println(toString(sent));
43 46
		}
44 47
		
45 48
		return sentences;
......
123 126
	}
124 127
	
125 128
	public static void main(String[] args) {
126
		try {
127
			toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
128
					"Et un petit test... En deux phrases ? ou trois.");
129
//		try {
130
//			toConnluFile(new File("/tmp/result.connlu"), new File("/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe"),
131
//					"Et un petit test... En deux phrases ? ou trois.");
132
//		}
133
//		catch (UnsupportedEncodingException | FileNotFoundException e) {
134
//			// TODO Auto-generated catch block
135
//			e.printStackTrace();
136
//		}
137
		UDPipeEngine.initializeUDLib(new File("/home/mdecorde/workspace047/org.txm.libs.udpipe"));
138
		System.out.println("VERSION: "+Version.current().toString());
139
		// /usr/lib/UDPipe/models/fr.udpipe
140
		// /home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe
141
		for (String p : new String[] {"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-gsd-ud-2.4-190531.udpipe",
142
				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-partut-ud-2.4-190531.udpipe",
143
				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-sequoia-ud-2.4-190531.udpipe",
144
				"/home/mdecorde/SOFTWARE/udpipe-1.2.0-bin/bin-linux64/french-spoken-ud-2.4-190531.udpipe"}) {
145
			System.out.println("M="+p);
146
		process(p, "Et un petit test... En deux phrases ? ou trois.");
129 147
		}
130
		catch (UnsupportedEncodingException | FileNotFoundException e) {
131
			// TODO Auto-generated catch block
132
			e.printStackTrace();
133
		}
134 148
	}
135 149
	
136 150
	public static void processSentences(String modelPath, Sentences sentences) {
......
138 152
	}
139 153
	
140 154
	public static void processSentences(Model model, Sentences sentences) {
141
		System.out.println("Model: " + model);
155
		//System.out.println("Processing sent ("+sentences.size()+") with model: " + model);
142 156
		for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
143 157
			Sentence sent = sentences.get(iSentence);
144
			System.out.println(sent.getWords().size());
145
			// model.tag(sent, "");
158
			
159
			model.tag(sent, "");
160
			model.parse(sent, "");
161
			//System.out.println(toString(sent));
146 162
		}
147 163
	}
148 164
	
......
155 171
	 */
156 172
	public static HashMap<String, String> wordToHashMap(Word word, String prefix) {
157 173
		if (prefix == null) prefix = "";
174
		
158 175
		HashMap<String, String> properties = new HashMap<>();
159 176
		properties.put(prefix + "id", Integer.toString(word.getId()));
160 177
		properties.put(prefix + "form", word.getForm());
......
166 183
		properties.put(prefix + "deprel", word.getDeprel());
167 184
		properties.put(prefix + "deps", word.getDeps());
168 185
		properties.put(prefix + "misc", word.getMisc());
169
		return null;
186
		
187
		return properties;
170 188
	}
171 189
}
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDStringTokenizer.java (revision 3264)
18 18
	
19 19
	protected InputFormat tokenizer;
20 20
	
21
	
21 22
	public UDStringTokenizer(String lang) {
22 23
		File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox
23 24
		File modelFile = new File(modelsDirectory, lang + ".udpipe");
......
29 30
	}
30 31
	
31 32
	@Override
32
	public List<String> processText(String text) {
33
		ArrayList<String> result = new ArrayList<>();
33
	public List<List<String>> processText(String text) {
34
		ArrayList<List<String>> result = new ArrayList<>();
34 35
		
36
		
35 37
		tokenizer.setText(text);
36 38
		Sentence sent = new Sentence();
37 39
		while (tokenizer.nextSentence(sent)) {
40
			List<String> sresult = new ArrayList<>();
38 41
			Words words = sent.getWords();
39 42
			for (int iWord = 0; iWord < words.size(); iWord++) {
40
				result.add(words.get(iWord).getForm());
43
				sresult.add(words.get(iWord).getForm());
41 44
			}
45
			if (sresult.size() > 0) {
46
				sresult.remove(0); // remove the <root> element
47
			}
48
			if (sresult.size() > 0) {
49
				result.add(sresult);
50
			}
42 51
		}
43
		if (result.size() > 0) {
44
			result.remove(0); // remove the <root> element
45
		}
52
		
46 53
		return result;
47 54
	}
55

  
56
	@Override
57
	public boolean doSentences() {
58
		
59
		return true;
60
	}
48 61
}
tmp/org.txm.udpipe.core/src/org/txm/udpipe/core/UDPipeEngine.java (revision 3264)
20 20
import org.txm.utils.logger.Log;
21 21
import org.txm.xml.xmltxm.XMLTXMWordPropertiesInjection;
22 22

  
23
import cz.cuni.mff.ufal.udpipe.InputFormat;
24
import cz.cuni.mff.ufal.udpipe.Model;
23 25
import cz.cuni.mff.ufal.udpipe.Sentence;
24 26
import cz.cuni.mff.ufal.udpipe.Sentences;
25 27
import cz.cuni.mff.ufal.udpipe.Version;
......
45 47
		Bundle bundle = BundleUtils.getBundle("org.txm.libs.udpipe");
46 48
		File bundleDir = BundleUtils.getBundleFile("org.txm.libs.udpipe");
47 49
		if (bundleDir.isDirectory()) {
48
			if (OSDetector.isFamilyWindows()) {
49
				libFile = new File(bundleDir, "lib/libudpipe_java.dll");
50
			}
51
			else if (OSDetector.isFamilyMac()) {
52
				libFile = new File(bundleDir, "lib/libudpipe_java.dylib");
53
			}
54
			else if (OSDetector.isFamilyUnix()) {
55
				libFile = new File(bundleDir, "lib/libudpipe_java.so");
56
			}
57
			udpipe_java.setLibraryPath(libFile.getAbsolutePath());
50
			libFile = initializeUDLib(bundleDir);
58 51
		}
59 52
		else {
60 53
			URL entry = bundle.getEntry("lib/libudpipe_java.so");
......
64 57
		return getDetails() != null;
65 58
	}
66 59
	
60
	public static File initializeUDLib(File bundleDir) {
61
		File libFile;
62
		if (OSDetector.isFamilyWindows()) {
63
			libFile = new File(bundleDir, "lib/libudpipe_java.dll");
64
		}
65
		else if (OSDetector.isFamilyMac()) {
66
			libFile = new File(bundleDir, "lib/libudpipe_java.dylib");
67
		}
68
		else {
69
			libFile = new File(bundleDir, "lib/libudpipe_java.so");
70
		}
71
		udpipe_java.setLibraryPath(libFile.getAbsolutePath());
72
		return libFile;
73
	}
74
	
67 75
	@Override
68 76
	public StringTokenizer getStringTokenizer(String lang) throws Exception {
69 77
		return new UDStringTokenizer(lang);
......
150 158
		
151 159
		File modelsDirectory = new File(UDPipePreferences.getInstance().getString(UDPipePreferences.MODELS_PATH)); // default models directory is set in the Toolbox
152 160
		File modelFile = new File(modelsDirectory, lang + ".udpipe");
161
		//System.out.println("model="+modelFile.getAbsolutePath());
153 162
		
154 163
		try {
155 164
			// get words
......
160 169
			}
161 170
			
162 171
			Sentences sentences = wparser.getSentences();
172
						
163 173
			// System.out.println("SENTENCES PARSED: " + UDPipeJavaUtils.toString(sentences));
164 174
			// tag
165 175
			UDPipeJavaUtils.processSentences(modelFile.getAbsolutePath(), sentences);
......
168 178
			
169 179
			// update XML-TXM files
170 180
			XMLTXMWordPropertiesInjection injector = new XMLTXMWordPropertiesInjection(xmlFile);
181
			HashMap<String, HashMap<String, String>> rules = new HashMap<>();
171 182
			for (int iSentence = 0; iSentence < sentences.size(); iSentence++) {
172 183
				Sentence sentence = sentences.get(iSentence);
173 184
				Words words = sentence.getWords();
......
181 192
					int idx2 = misc.indexOf("|", idx + 6);
182 193
					if (idx2 < 0) idx2 = misc.length();
183 194
					String id = misc.substring(idx + 6, idx2);
184
					HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "ud-");
185
					injector.addProperty(id, properties);
195
					if (id != null && id.length() > 0) {
196
						HashMap<String, String> properties = UDPipeJavaUtils.wordToHashMap(word, "#ud-");
197
						rules.put(id, properties);
198
					}
186 199
				}
187 200
			}
188 201
			File outFile = new File(binaryCorpusDirectory, xmlFile.getName() + ".tmp");
202
			injector.setProperties(rules);
189 203
			if (injector.process(outFile) && outFile.exists()) {
190 204
				xmlFile.delete();
191 205
				outFile.renameTo(xmlFile);

Formats disponibles : Unified diff