Révision 1204
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 1204) | ||
---|---|---|
2 | 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
3 | 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
4 | 4 |
// Sophia Antipolis, University of Paris 3. |
5 |
//
|
|
5 |
// |
|
6 | 6 |
// The TXM platform is free software: you can redistribute it |
7 | 7 |
// and/or modify it under the terms of the GNU General Public |
8 | 8 |
// License as published by the Free Software Foundation, |
9 | 9 |
// either version 2 of the License, or (at your option) any |
10 | 10 |
// later version. |
11 |
//
|
|
11 |
// |
|
12 | 12 |
// The TXM platform is distributed in the hope that it will be |
13 | 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
14 | 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
15 | 15 |
// PURPOSE. See the GNU General Public License for more |
16 | 16 |
// details. |
17 |
//
|
|
17 |
// |
|
18 | 18 |
// You should have received a copy of the GNU General |
19 | 19 |
// Public License along with the TXM platform. If not, see |
20 | 20 |
// http://www.gnu.org/licenses. |
21 |
//
|
|
22 |
//
|
|
23 |
//
|
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 | 24 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $ |
25 | 25 |
// $LastChangedRevision: 3400 $ |
26 |
// $LastChangedBy: mdecorde $
|
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 | 27 |
// |
28 | 28 |
package org.txm.importer.xmltxm |
29 | 29 |
|
... | ... | |
46 | 46 |
*/ |
47 | 47 |
|
48 | 48 |
public class BuildTTSrc { |
49 |
|
|
49 |
|
|
50 | 50 |
/** The url. */ |
51 | 51 |
private def url; |
52 |
|
|
52 |
|
|
53 | 53 |
/** The input data. */ |
54 | 54 |
private def inputData; |
55 |
|
|
55 |
|
|
56 | 56 |
/** The factory. */ |
57 | 57 |
private def factory; |
58 |
|
|
58 |
|
|
59 | 59 |
/** The parser. */ |
60 | 60 |
private XMLStreamReader parser; |
61 |
|
|
61 |
|
|
62 | 62 |
/** The output. */ |
63 | 63 |
private BufferedWriter output; |
64 |
|
|
64 |
|
|
65 | 65 |
/** |
66 | 66 |
* Instantiates a new BuildTTSrc. |
67 | 67 |
* uses XML-TXM V2 |
... | ... | |
74 | 74 |
inputData = url.openStream(); |
75 | 75 |
factory = XMLInputFactory.newInstance(); |
76 | 76 |
parser = factory.createXMLStreamReader(inputData); |
77 |
|
|
77 |
|
|
78 | 78 |
} catch (XMLStreamException ex) { |
79 | 79 |
System.out.println(ex); |
80 | 80 |
} catch (IOException ex) { |
81 | 81 |
System.out.println("IOException while parsing "); |
82 | 82 |
} |
83 | 83 |
} |
84 |
|
|
84 |
|
|
85 | 85 |
/** |
86 | 86 |
* Creates the output. |
87 | 87 |
* |
... | ... | |
99 | 99 |
return false; |
100 | 100 |
} |
101 | 101 |
} |
102 |
|
|
102 |
|
|
103 | 103 |
/** |
104 | 104 |
* Process. |
105 | 105 |
* |
106 | 106 |
* @param outfile the outfile |
107 |
* @param formtype if there are multiple forms, use this parameter to choose the correct one; if null, the first form found is taken
|
|
107 |
* @param formtype if there are multiple forms, use this parameter to choose the correct one; if null, the first form found is taken
|
|
108 | 108 |
* @return true, if successful |
109 | 109 |
*/ |
110 | 110 |
public boolean process(File outfile, String formtype) { |
111 | 111 |
if (!createOutput(outfile)) |
112 | 112 |
return false; |
113 |
|
|
113 |
|
|
114 |
boolean inW = false |
|
114 | 115 |
boolean flagform = false; // to catch the content of the form tag |
115 | 116 |
boolean firstform = false; // to know if its the first form of the w element |
116 | 117 |
String form = ""; // the content of the form tag |
... | ... | |
122 | 123 |
switch (event) { |
123 | 124 |
case XMLStreamConstants.START_ELEMENT: |
124 | 125 |
localname = parser.getLocalName(); |
126 |
|
|
125 | 127 |
switch (localname) { |
126 | 128 |
case "w": |
127 |
//firstform = true; |
|
129 |
//firstform = true; |
|
130 |
inW = true |
|
128 | 131 |
break; |
129 | 132 |
case "form": |
130 |
// if (firstform) { |
|
131 |
// if (formtype != null) { |
|
132 |
// if(parser.getAttributeCount() > 0 |
|
133 |
// && parser.getAttributeValue(0).equals(formtype)) // only one attribute in form, type |
|
134 |
// flagform = true; |
|
135 |
// } |
|
136 |
// else |
|
133 |
if (inW) { |
|
134 |
// if (firstform) { |
|
135 |
// if (formtype != null) { |
|
136 |
// if(parser.getAttributeCount() > 0 |
|
137 |
// && parser.getAttributeValue(0).equals(formtype)) // only one attribute in form, type |
|
138 |
// flagform = true; |
|
139 |
// } |
|
140 |
// else |
|
137 | 141 |
flagform = true; |
138 | 142 |
form = ""; |
139 | 143 |
firstform = false; |
140 |
//} |
|
144 |
//} |
|
145 |
} |
|
141 | 146 |
break; |
142 | 147 |
case "s": // TreeTagger can use s tags |
143 | 148 |
buffer.append("<s>\n"); |
... | ... | |
147 | 152 |
case XMLStreamConstants.END_ELEMENT: |
148 | 153 |
localname = parser.getLocalName(); |
149 | 154 |
switch (localname) { |
155 |
case "w": |
|
156 |
inW = false |
|
157 |
break |
|
150 | 158 |
case "form": |
151 |
flagform = false; |
|
152 |
form = form.trim() |
|
153 |
if (form.length() == 0) buffer.append("__EMPTY__\n"); |
|
154 |
else buffer.append(form.replace("\n", "").replace("<", "<")+ "\n"); |
|
155 |
//buffer.append(form+ "\n"); // its a txt file no need to use entities |
|
159 |
if (inW) { // ensure to process a form inside a w |
|
160 |
flagform = false; |
|
161 |
form = form.trim() |
|
162 |
if (form.length() == 0) buffer.append("__EMPTY__\n"); |
|
163 |
else buffer.append(form.replace("\n", "").replace("<", "<")+ "\n"); |
|
164 |
//buffer.append(form+ "\n"); // its a txt file no need to use entities |
|
165 |
} |
|
156 | 166 |
break; |
157 |
|
|
167 |
|
|
158 | 168 |
case "s": |
159 | 169 |
buffer.append("</s>\n"); |
160 | 170 |
break; |
161 | 171 |
} |
162 | 172 |
break; |
163 |
|
|
173 |
|
|
164 | 174 |
case XMLStreamConstants.CHARACTERS: |
165 | 175 |
if (flagform) { |
166 | 176 |
if (parser.getText().length() > 0) |
... | ... | |
181 | 191 |
} catch (Exception ex) { |
182 | 192 |
System.out.println(ex); |
183 | 193 |
return false; |
184 |
}
|
|
185 |
|
|
194 |
} |
|
195 |
|
|
186 | 196 |
return true; |
187 | 197 |
} |
188 |
|
|
198 |
|
|
189 | 199 |
/** |
190 | 200 |
* The main method. |
191 | 201 |
* |
192 | 202 |
* @param args the arguments |
193 | 203 |
*/ |
194 | 204 |
public static void main(String[] args) { |
195 |
|
|
205 |
|
|
196 | 206 |
String rootDir = "~/xml/rgaqcj/"; |
197 | 207 |
// new File(rootDir+"/identity/").mkdir(); |
198 |
|
|
208 |
|
|
199 | 209 |
ArrayList<String> milestones = new ArrayList<String>();// the tags who |
200 | 210 |
// you want them |
201 | 211 |
// to stay |
... | ... | |
204 | 214 |
milestones.add("pb"); |
205 | 215 |
milestones.add("lb"); |
206 | 216 |
milestones.add("catRef"); |
207 |
|
|
217 |
|
|
208 | 218 |
File srcfile = new File(rootDir + "anainline/", "roland-p5.xml"); |
209 | 219 |
File resultfile = new File(rootDir + "ttsrc/", "roland-p5.tt"); |
210 | 220 |
println("build ttsrc file : " + srcfile + " to : " + resultfile); |
211 |
|
|
221 |
|
|
212 | 222 |
def builder = new BuildTTSrc(srcfile.toURL(), milestones); |
213 | 223 |
builder.process(resultfile); |
214 |
|
|
224 |
|
|
215 | 225 |
return; |
216 | 226 |
} |
217 |
|
|
227 |
|
|
218 | 228 |
} |
Formats disponibles : Unified diff