Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / BuildTTSrc.groovy @ 1000

History | View | Annotate | Download (5.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.scripts.teitxm;
29

    
30
import java.text.DateFormat;
31
import java.util.Date;
32
import java.util.ArrayList;
33
import javax.xml.stream.*;
34
import java.net.URL;
35

    
36
import org.txm.Toolbox;
37
import org.txm.importer.scripts.filters.*;
38

    
39
/**
 * Builds the TreeTagger source file of an XML-TXM document.
 *
 * The document is read with a StAX parser. The text of every "form" element
 * is written on its own line, and the boundaries of "s" elements are written
 * as "&lt;s&gt;" and "&lt;/s&gt;" lines.
 *
 * An instance reads its document once: after {@link #process} has run, the
 * parser and the input stream are closed and the instance cannot be reused.
 *
 * @author mdecorde
 */
public class BuildTTSrc {

    /** Line written for a form element whose trimmed text is empty. */
    private static final String EMPTY_FORM = "__EMPTY__";

    /** The url of the document to process. */
    private def url;

    /** The stream opened on the url. */
    private def inputData;

    /** The StAX factory that created the parser. */
    private def factory;

    /** The parser reading the document; null when it could not be created or was closed. */
    private XMLStreamReader parser;

    /** The writer of the result file; only set while process() is running. */
    private BufferedWriter output;

    /**
     * Opens the document and prepares the StAX parser. Uses XML-TXM V2.
     *
     * I/O and XML errors are printed on standard output rather than thrown;
     * in that case the parser stays null and {@link #process} returns false.
     *
     * @param url the url of the file to process
     */
    public BuildTTSrc(URL url) {
        this.url = url;
        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but no parser will ever read it
            releaseResources();
        } catch (IOException ex) {
            System.out.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Opens the UTF-8 writer of the result file.
     *
     * @param outfile the file to write
     * @return true if the writer was opened, false (after printing the error) otherwise
     */
    private boolean createOutput(File outfile) {
        try {
            output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Closes the writer, the parser and the input stream, ignoring close
     * failures, and clears the fields so that a later call is a no-op.
     */
    private void releaseResources() {
        try {
            if (output != null) output.close();
        } catch (IOException ignored) {
            // best effort: nothing more can be done with a writer that fails to close
        }
        output = null;
        try {
            if (parser != null) parser.close();
        } catch (XMLStreamException ignored) {
            // best effort
        }
        parser = null;
        try {
            if (inputData != null) inputData.close();
        } catch (IOException ignored) {
            // best effort
        }
        inputData = null;
    }

    /**
     * Reads the whole document and writes the TreeTagger source file.
     *
     * Each form is trimmed, its line breaks are removed and "&lt;" is written
     * as "&amp;lt;"; a form with no text is written as "__EMPTY__". When the
     * Toolbox.TREETAGGER_APOSTROPHE preference is "false", typographic
     * apostrophes are replaced with the ASCII apostrophe.
     *
     * The parser, the input stream and the writer are always closed when this
     * method returns after the output file was opened.
     *
     * @param outfile the file to write
     * @param formtype the type of form to select. NOTE: currently ignored, the
     *        selection by type is disabled and every form element is written
     * @return true if the file was written, false if the document could not be
     *         opened, the output could not be created or an error occurred
     */
    public boolean process(File outfile, String formtype = null) {
        if (parser == null) {
            // the constructor failed or the document was already processed:
            // do not create or truncate the output file for nothing
            System.out.println("BuildTTSrc: no open XML parser for " + url);
            return false;
        }
        if (!createOutput(outfile)) {
            return false;
        }

        boolean insideForm = false; // true between the start and the end of a form element
        StringBuilder form = new StringBuilder(); // text of the current form element
        StringBuilder buffer = new StringBuilder(); // content of the result file
        try {
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "form":
                                insideForm = true;
                                form.setLength(0);
                                break;
                            case "s": // TreeTagger can use s tags
                                buffer.append("<s>\n");
                                break;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        switch (parser.getLocalName()) {
                            case "form":
                                insideForm = false;
                                String text = form.toString().trim();
                                if (text.length() == 0) {
                                    buffer.append(EMPTY_FORM).append("\n");
                                } else {
                                    // one token per line: drop inner line breaks, keep "<" from looking like a tag
                                    buffer.append(text.replace("\n", "").replace("<", "&lt;")).append("\n");
                                }
                                break;
                            case "s":
                                buffer.append("</s>\n");
                                break;
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        if (insideForm) {
                            form.append(parser.getText());
                        }
                        break;
                }
            }

            String str = buffer.toString();
            if ("false".equals(Toolbox.getPreference(Toolbox.TREETAGGER_APOSTROPHE))) {
                // NOTE(review): the searched characters were lost from the original literals
                // (they read as empty strings, which inserts an apostrophe between every
                // character). U+2019 and U+2018 are assumed - confirm against the repository.
                str = str.replace("\u2019", "'").replace("\u2018", "'");
            }
            output.write(str);
            output.close(); // closed here so that a failing flush is reported as a failure
            return true;
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        } finally {
            releaseResources();
        }
    }

    /**
     * Development entry point: converts a hard-coded sample file found under
     * the home directory of the user.
     *
     * @param args the arguments (unused)
     */
    public static void main(String[] args) {
        // java.io.File does not expand "~", so the home directory is resolved explicitly
        String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";

        File srcfile = new File(rootDir + "anainline/", "roland-p5.xml");
        File resultfile = new File(rootDir + "ttsrc/", "roland-p5.tt");
        println("build ttsrc file : " + srcfile + " to : " + resultfile);

        def builder = new BuildTTSrc(srcfile.toURI().toURL());
        builder.process(resultfile, null);
    }
}