Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / bfm / importer.groovy @ 1000

History | View | Annotate | Download (9.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-11 08:38:47 +0200 (mer. 11 mai 2016) $
25
// $LastChangedRevision: 3211 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.importer.bfm;
29

    
30
import org.txm.importer.*;
31
import org.txm.scripts.importer.*;
32
import org.txm.importer.scripts.filters.*;
33
import org.txm.scripts.*;
34
import org.txm.importer.scripts.xmltxm.*;
35
import org.txm.utils.io.FileCopy;
36
import org.txm.utils.treetagger.TreeTagger;
37

    
38
import javax.xml.stream.*;
39

    
40
import java.io.File;
41
import java.net.URL;
42
import java.util.Properties;
43

    
44
import org.txm.*;
45
import org.txm.core.engines.*;
46

    
47
import org.txm.scripts.filters.CutHeader.*;
48
import org.txm.scripts.filters.ReunitBrokenWords.*;
49
import org.txm.scripts.filters.Tokeniser.*;
50
import org.txm.scripts.filters.TagSentences.*;
51
import org.txm.scripts.filters.FusionHeader.*;
52

    
53
// TODO: Auto-generated Javadoc
54
/**
 * Converts TEI-BFM files to TEI-TXM: <br/>
 * 1- tokenize: manage BFM's &lt;choice&gt; policy (see <a
 * href="http://bfm.ens-lsh.fr">BFM website</a>).<br/>
 * 2- launch XML2TEITXM.groovy with no annotation to import. <br/>
 *
 * @param dir contains the source files
 * @param basename the basename
 * @return true, if successful
 * @author mdecorde
 */
65
class importer {
66
        
67
        /**
         * Converts the files of {@code txmDir} into XML-TEI-TXM files, using
         * {@code binDir} for the intermediate results.
         *
         * Stages, in order:
         * 1- for each ".xml" file of txmDir (except "import.xml"), evaluate the
         *    {@code metadataXPath} queries and inject the values in the file
         *    with AddAttributeInXml;
         * 2- rewrite each regular file of txmDir into binDir/ptokenized with
         *    OneTagPerLine;
         * 3- run the CutHeader, Tokeniser and FusionHeader filters on each file
         *    of binDir/ptokenized, writing the result in binDir/tokenized;
         * 4- copy the files rejected by ValidateXml to binDir/tokenizerError
         *    and delete them from binDir/tokenized;
         * 5- process each remaining file with Xml2Ana, writing the result in
         *    txmDir under the same file name.
         *
         * @param srcDir the source directory (not used by this method)
         * @param binDir the directory receiving the "ptokenized", "tokenized",
         *            "headers" and "tokenizerError" working directories
         * @param txmDir the directory containing the files to convert; it also
         *            receives the final files
         * @param basename the basename (not used by this method)
         * @param metadataXPath metadata name -> XPath query; may be null, in
         *            which case no XPath query is evaluated
         * @return true if every stage succeeded; false if txmDir is empty or
         *         cannot be listed, if a metadata injection fails, if a working
         *         directory cannot be listed, if the tokenizer throws, if no
         *         tokenized file is left after validation or if Xml2Ana fails
         */
        public boolean run(File srcDir, File binDir, File txmDir, String basename, Properties metadataXPath)
        {
                // always start from empty working directories
                for (String dirname : ["ptokenized", "tokenized", "headers"]) {
                        File workDir = new File(binDir, dirname);
                        workDir.deleteDir();
                        workDir.mkdir();
                }

                List<String> milestones = ["lb", "pb", "milestone"];

                File[] sources = txmDir.listFiles();
                if (sources == null || sources.length == 0) return false;

                // INJECT THE METADATA IN EACH XML FILE
                for (File f : sources) {
                        String filename = f.getName();
                        // only the ".xml" files are processed and "import.xml" is skipped
                        if (!filename.endsWith(".xml") || filename.equals("import.xml")) continue;

                        HashMap<String, String> metadataValues = new HashMap<String, String>();
                        if (metadataXPath != null) {
                                def xpathprocessor = new XPathResult(f)
                                try {
                                        for (String name : metadataXPath.keySet()) {
                                                String value = xpathprocessor.getXpathResponse(metadataXPath.get(name), "N/A");
                                                value = value.trim().replace("\n", " ");
                                                metadataValues.put(name, value);
                                        }
                                } finally {
                                        // release the processor even if a query throws
                                        xpathprocessor.close();
                                }
                        }

                        // strip the "#forme_" prefix of the "forme" value
                        // NOTE(review): the previous code also declared a "prose" default
                        // that was never injected when "forme" is missing - confirm
                        // whether a default value is expected here.
                        if (metadataValues.containsKey("forme")) {
                                String formeValue = metadataValues.get("forme");
                                if (formeValue.startsWith("#forme_"))
                                        formeValue = formeValue.substring(7)
                                metadataValues.put("forme", formeValue)
                        }

                        // "morphosynt" is reduced to a flag: "N/A" is the value used
                        // when the XPath query gives no result
                        if (metadataValues.containsKey("morphosynt")) {
                                if (metadataValues.get("morphosynt").equals("N/A"))
                                        metadataValues.put("morphosynt","non vérifié");
                                else
                                        metadataValues.put("morphosynt","vérifié");
                        }

                        // "sigle" is always set: its own value, else "idbfm",
                        // else the file name without its ".xml" extension
                        String titreId = filename.substring(0, filename.length()-4);
                        if (metadataValues.containsKey("sigle")) {
                                titreId = metadataValues.get("sigle");
                        } else if (metadataValues.containsKey("idbfm")) {
                                titreId = metadataValues.get("idbfm");
                        }
                        metadataValues.put("sigle", titreId);

                        // inject the values in a copy of $f, then replace $f with the copy
                        AddAttributeInXml builder = new AddAttributeInXml(f, "text", metadataValues);
                        builder.onlyOneElement();
                        File outfile = new File(txmDir, "copy"+filename)

                        if (!builder.process(outfile)) {
                                println "Error while injecting metadata in $f"
                                return false;
                        }
                        builder = null;
                        if (!f.delete()) {
                                println "ERROR could not delete $f for metadatainjection"
                                return false
                        }
                        // $f is already deleted: a failed rename must not go unnoticed
                        if (!outfile.renameTo(f)) {
                                println "ERROR could not rename $outfile to $f for metadatainjection"
                                return false
                        }
                }

                // PREPARE EACH FILE TO BE TOKENIZED
                // NOTE(review): unlike the metadata stage, this stage takes every
                // regular file of txmDir, whatever its name - confirm this is intended.
                File[] txmFiles = txmDir.listFiles();
                if (txmFiles == null || txmFiles.length == 0) return false;

                println("preparing "+txmFiles.length+" files for the tokenizer");

                for (File f : txmFiles) {
                        if (f.isDirectory()) continue;
                        print "."
                        File resultfile = new File(binDir,"ptokenized/"+f.getName());

                        def builder2 = new OneTagPerLine(f.toURI().toURL(), milestones);
                        if (!builder2.process(resultfile)) {
                                // the file is skipped: the import goes on with the other files
                                println "Failed to tokenize file: "+f;
                                resultfile.delete();
                        }
                }
                println ""

                // TOKENIZE FILES
                File ptokenizedDir = new File(binDir, "ptokenized");
                File[] ptokenizedFiles = ptokenizedDir.listFiles();
                if (ptokenizedFiles == null) {
                        println "ERROR could not list the files of "+ptokenizedDir
                        return false;
                }
                println("Tokenizing "+ptokenizedFiles.length+" files")
                for (File infile : ptokenizedFiles) {
                        print "."
                        try {
                                // ReunitBrokenWords and TagSentences are not part of the sequence
                                Sequence S = new Sequence();
                                Filter cutHeader = new CutHeader();
                                Filter tokeniser = new Tokeniser(infile);
                                Filter fusionHeader = new FusionHeader();
                                S.add(cutHeader);
                                S.add(tokeniser);
                                S.add(fusionHeader);

                                File xmlfile = new File(binDir,"tokenized/"+infile.getName());

                                S.SetInFileAndOutFile(infile.getAbsolutePath(), xmlfile.getAbsolutePath());
                                S.setEncodages("UTF-8","UTF-8");

                                // both header filters are given the same header file
                                String headerPath = new File(binDir,"/headers/"+infile.getName()+"header.xml").getAbsolutePath();
                                Object[] arguments1 = [headerPath];
                                cutHeader.SetUsedParam(arguments1);
                                Object[] arguments2 = [headerPath, cutHeader];
                                fusionHeader.SetUsedParam(arguments2);

                                S.proceed();
                                S.clean();
                        } catch(Exception e) {
                                org.txm.utils.logger.Log.printStackTrace(e);
                                return false;
                        }
                }
                println ""

                // VALIDATE THE TOKENIZED FILES
                File tokenizedDir = new File(binDir, "tokenized");
                File[] tokenizedFiles = tokenizedDir.listFiles();
                if (tokenizedFiles == null) {
                        println "ERROR could not list the files of "+tokenizedDir
                        return false;
                }
                println "Validating XML of "+tokenizedFiles.length+" files"
                for (File f : tokenizedFiles) {
                        print "."
                        if (!ValidateXml.test(f)) {
                                // keep a copy of the rejected file and remove it from the next stage
                                System.out.println("XML NOT VALID "+f);
                                new File(binDir, "tokenizerError").mkdir();
                                FileCopy.copy(f,new File(binDir, "tokenizerError/"+f.getName()));
                                f.delete();
                        }
                }
                println("");

                // TRANSFORM INTO XML-TEI-TXM
                // the directory is listed again since rejected files were deleted
                File[] validFiles = tokenizedDir.listFiles();
                if (validFiles == null) {
                        println "ERROR could not list the files of "+tokenizedDir
                        return false;
                }
                println("Building xml-tei-txm "+validFiles.length+ " files")
                if (validFiles.length == 0) return false;
                for (File file : validFiles) {
                        print "."
                        String txmfile = file.getName();

                        def correspType = new HashMap<String,String>()
                        def correspRef = new HashMap<String,String>()

                        // the ids of all the respStmt must be listed
                        def respId = [];
                        // maps a respId to the execution report of the tool
                        def applications = new HashMap<String,HashMap<String,String>>();
                        // maps a respId to the type attributes of the ana property of the txm w,
                        // used to build the references to the taxonomies
                        def taxonomiesUtilisees = new HashMap<String,String[]>();
                        // maps an item id to its description and its URI
                        def itemsURI = new HashMap<String,HashMap<String,String>>();
                        // respStmt informations
                        // resps (respId <see above>, [description, person, date])
                        def resps = new HashMap<String,String[]>();

                        // run the process: all the tables are empty, no annotation is imported
                        def builder3 = new Xml2Ana(file);
                        builder3.setCorrespondances(correspRef, correspType);
                        builder3.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                        if (!builder3.process(new File(txmDir,txmfile))) {
                                println "Failed to build xml-txm file from file: "+file;
                                return false;
                        }
                }
                println("")
                return true;
        }
258

    
259
        /**
         * For test purpose: runs OneTagPerLine then the CutHeader and Tokeniser
         * filters on one hard-coded file and prints the elapsed time.
         *
         * The tokenizer is not run when the OneTagPerLine pass fails, since
         * its input file is deleted in that case.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {
                long start = System.currentTimeMillis();
                File dir = new File("/home/mdecorde/xml/qgraal")
                File srcfile = new File(dir, "qgraal_cm-detoken.xml");
                File resultfile = new File(dir,"QGRAALFRO-pt1.xml");
                File xmlfile = new File(dir,"QGRAALFRO-t1.xml");

                def builder2 = new OneTagPerLine(srcfile.toURI().toURL(), []);
                if (!builder2.process(resultfile)) {
                        println "Failed to tokenize file: "+srcfile;
                        resultfile.delete();
                        // the input of the tokenizer is gone: stop here
                        return;
                }

                try {
                        Sequence S = new Sequence();
                        Filter cutHeader = new CutHeader();
                        Filter tokeniser = new Tokeniser(resultfile);
                        // configured below but not added to the sequence, as in the original test setup
                        Filter fusionHeader = new FusionHeader();
                        S.add(cutHeader);
                        S.add(tokeniser);

                        S.SetInFileAndOutFile(resultfile.getAbsolutePath(), xmlfile.getAbsolutePath());
                        S.setEncodages("UTF-8","UTF-8");

                        // both header filters are given the same header file
                        String headerPath = new File(dir,"header.xml").getAbsolutePath();
                        Object[] arguments1 = [headerPath];
                        cutHeader.SetUsedParam(arguments1);
                        Object[] arguments2 = [headerPath, cutHeader];
                        fusionHeader.SetUsedParam(arguments2);

                        S.proceed();
                        S.clean();
                } catch(Exception e) {
                        org.txm.utils.logger.Log.printStackTrace(e);
                }
                println "time: "+(System.currentTimeMillis()-start)/1000
        }
305
}