Révision 1204
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 1204) | ||
|---|---|---|
| 2 | 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
| 3 | 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
| 4 | 4 |
// Sophia Antipolis, University of Paris 3. |
| 5 |
//
|
|
| 5 |
// |
|
| 6 | 6 |
// The TXM platform is free software: you can redistribute it |
| 7 | 7 |
// and/or modify it under the terms of the GNU General Public |
| 8 | 8 |
// License as published by the Free Software Foundation, |
| 9 | 9 |
// either version 2 of the License, or (at your option) any |
| 10 | 10 |
// later version. |
| 11 |
//
|
|
| 11 |
// |
|
| 12 | 12 |
// The TXM platform is distributed in the hope that it will be |
| 13 | 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
| 14 | 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
| 15 | 15 |
// PURPOSE. See the GNU General Public License for more |
| 16 | 16 |
// details. |
| 17 |
//
|
|
| 17 |
// |
|
| 18 | 18 |
// You should have received a copy of the GNU General |
| 19 | 19 |
// Public License along with the TXM platform. If not, see |
| 20 | 20 |
// http://www.gnu.org/licenses. |
| 21 |
//
|
|
| 22 |
//
|
|
| 23 |
//
|
|
| 21 |
// |
|
| 22 |
// |
|
| 23 |
// |
|
| 24 | 24 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $ |
| 25 | 25 |
// $LastChangedRevision: 3400 $ |
| 26 |
// $LastChangedBy: mdecorde $
|
|
| 26 |
// $LastChangedBy: mdecorde $ |
|
| 27 | 27 |
// |
| 28 | 28 |
package org.txm.importer.xmltxm |
| 29 | 29 |
|
| ... | ... | |
| 46 | 46 |
*/ |
| 47 | 47 |
|
| 48 | 48 |
public class BuildTTSrc {
|
| 49 |
|
|
| 49 |
|
|
| 50 | 50 |
/** The url. */ |
| 51 | 51 |
private def url; |
| 52 |
|
|
| 52 |
|
|
| 53 | 53 |
/** The input data. */ |
| 54 | 54 |
private def inputData; |
| 55 |
|
|
| 55 |
|
|
| 56 | 56 |
/** The factory. */ |
| 57 | 57 |
private def factory; |
| 58 |
|
|
| 58 |
|
|
| 59 | 59 |
/** The parser. */ |
| 60 | 60 |
private XMLStreamReader parser; |
| 61 |
|
|
| 61 |
|
|
| 62 | 62 |
/** The output. */ |
| 63 | 63 |
private BufferedWriter output; |
| 64 |
|
|
| 64 |
|
|
| 65 | 65 |
/** |
| 66 | 66 |
* Instantiates a new BuildTTSrc. |
| 67 | 67 |
* uses XML-TXM V2 |
| ... | ... | |
| 74 | 74 |
inputData = url.openStream(); |
| 75 | 75 |
factory = XMLInputFactory.newInstance(); |
| 76 | 76 |
parser = factory.createXMLStreamReader(inputData); |
| 77 |
|
|
| 77 |
|
|
| 78 | 78 |
} catch (XMLStreamException ex) {
|
| 79 | 79 |
System.out.println(ex); |
| 80 | 80 |
} catch (IOException ex) {
|
| 81 | 81 |
System.out.println("IOException while parsing ");
|
| 82 | 82 |
} |
| 83 | 83 |
} |
| 84 |
|
|
| 84 |
|
|
| 85 | 85 |
/** |
| 86 | 86 |
* Creates the output. |
| 87 | 87 |
* |
| ... | ... | |
| 99 | 99 |
return false; |
| 100 | 100 |
} |
| 101 | 101 |
} |
| 102 |
|
|
| 102 |
|
|
| 103 | 103 |
/** |
| 104 | 104 |
* Process. |
| 105 | 105 |
* |
| 106 | 106 |
* @param outfile the outfile |
| 107 |
* @param formtype when a word has several forms, the type of the form to use; if null, the first form found is used
|
|
| 107 |
* @param formtype when a word has several forms, the type of the form to use; if null, the first form found is used
|
|
| 108 | 108 |
* @return true, if successful |
| 109 | 109 |
*/ |
| 110 | 110 |
public boolean process(File outfile, String formtype) {
|
| 111 | 111 |
if (!createOutput(outfile)) |
| 112 | 112 |
return false; |
| 113 |
|
|
| 113 |
|
|
| 114 |
boolean inW = false |
|
| 114 | 115 |
boolean flagform = false; // to catch the content of the form tag |
| 115 | 116 |
boolean firstform = false; // to know if its the first form of the w element |
| 116 | 117 |
String form = ""; // the content of the form tag |
| ... | ... | |
| 122 | 123 |
switch (event) {
|
| 123 | 124 |
case XMLStreamConstants.START_ELEMENT: |
| 124 | 125 |
localname = parser.getLocalName(); |
| 126 |
|
|
| 125 | 127 |
switch (localname) {
|
| 126 | 128 |
case "w": |
| 127 |
//firstform = true; |
|
| 129 |
//firstform = true; |
|
| 130 |
inW = true |
|
| 128 | 131 |
break; |
| 129 | 132 |
case "form": |
| 130 |
// if (firstform) {
|
|
| 131 |
// if (formtype != null) {
|
|
| 132 |
// if(parser.getAttributeCount() > 0 |
|
| 133 |
// && parser.getAttributeValue(0).equals(formtype)) // only one attribute in form, type |
|
| 134 |
// flagform = true; |
|
| 135 |
// } |
|
| 136 |
// else |
|
| 133 |
if (inW) {
|
|
| 134 |
// if (firstform) {
|
|
| 135 |
// if (formtype != null) {
|
|
| 136 |
// if(parser.getAttributeCount() > 0 |
|
| 137 |
// && parser.getAttributeValue(0).equals(formtype)) // only one attribute in form, type |
|
| 138 |
// flagform = true; |
|
| 139 |
// } |
|
| 140 |
// else |
|
| 137 | 141 |
flagform = true; |
| 138 | 142 |
form = ""; |
| 139 | 143 |
firstform = false; |
| 140 |
//} |
|
| 144 |
//} |
|
| 145 |
} |
|
| 141 | 146 |
break; |
| 142 | 147 |
case "s": // TreeTagger can use s tags |
| 143 | 148 |
buffer.append("<s>\n");
|
| ... | ... | |
| 147 | 152 |
case XMLStreamConstants.END_ELEMENT: |
| 148 | 153 |
localname = parser.getLocalName(); |
| 149 | 154 |
switch (localname) {
|
| 155 |
case "w": |
|
| 156 |
inW = false |
|
| 157 |
break |
|
| 150 | 158 |
case "form": |
| 151 |
flagform = false; |
|
| 152 |
form = form.trim() |
|
| 153 |
if (form.length() == 0) buffer.append("__EMPTY__\n");
|
|
| 154 |
else buffer.append(form.replace("\n", "").replace("&lt;", "<")+ "\n");
|
|
| 155 |
//buffer.append(form+ "\n"); // its a txt file no need to use entities |
|
| 159 |
if (inW) { // ensure to process a form inside a w
|
|
| 160 |
flagform = false; |
|
| 161 |
form = form.trim() |
|
| 162 |
if (form.length() == 0) buffer.append("__EMPTY__\n");
|
|
| 163 |
else buffer.append(form.replace("\n", "").replace("&lt;", "<")+ "\n");
|
|
| 164 |
//buffer.append(form+ "\n"); // its a txt file no need to use entities |
|
| 165 |
} |
|
| 156 | 166 |
break; |
| 157 |
|
|
| 167 |
|
|
| 158 | 168 |
case "s": |
| 159 | 169 |
buffer.append("</s>\n");
|
| 160 | 170 |
break; |
| 161 | 171 |
} |
| 162 | 172 |
break; |
| 163 |
|
|
| 173 |
|
|
| 164 | 174 |
case XMLStreamConstants.CHARACTERS: |
| 165 | 175 |
if (flagform) {
|
| 166 | 176 |
if (parser.getText().length() > 0) |
| ... | ... | |
| 181 | 191 |
} catch (Exception ex) {
|
| 182 | 192 |
System.out.println(ex); |
| 183 | 193 |
return false; |
| 184 |
}
|
|
| 185 |
|
|
| 194 |
} |
|
| 195 |
|
|
| 186 | 196 |
return true; |
| 187 | 197 |
} |
| 188 |
|
|
| 198 |
|
|
| 189 | 199 |
/** |
| 190 | 200 |
* The main method. |
| 191 | 201 |
* |
| 192 | 202 |
* @param args the arguments |
| 193 | 203 |
*/ |
| 194 | 204 |
public static void main(String[] args) {
|
| 195 |
|
|
| 205 |
|
|
| 196 | 206 |
String rootDir = "~/xml/rgaqcj/"; |
| 197 | 207 |
// new File(rootDir+"/identity/").mkdir(); |
| 198 |
|
|
| 208 |
|
|
| 199 | 209 |
ArrayList<String> milestones = new ArrayList<String>();// the tags who |
| 200 | 210 |
// you want them |
| 201 | 211 |
// to stay |
| ... | ... | |
| 204 | 214 |
milestones.add("pb");
|
| 205 | 215 |
milestones.add("lb");
|
| 206 | 216 |
milestones.add("catRef");
|
| 207 |
|
|
| 217 |
|
|
| 208 | 218 |
File srcfile = new File(rootDir + "anainline/", "roland-p5.xml"); |
| 209 | 219 |
File resultfile = new File(rootDir + "ttsrc/", "roland-p5.tt"); |
| 210 | 220 |
println("build ttsrc file : " + srcfile + " to : " + resultfile);
|
| 211 |
|
|
| 221 |
|
|
| 212 | 222 |
def builder = new BuildTTSrc(srcfile.toURL(), milestones); |
| 213 | 223 |
builder.process(resultfile); |
| 214 |
|
|
| 224 |
|
|
| 215 | 225 |
return; |
| 216 | 226 |
} |
| 217 |
|
|
| 227 |
|
|
| 218 | 228 |
} |
Formats disponibles : Unified diff