root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / alceste / importer.groovy @ 1000
History | View | Annotate | Download (5.8 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 479 | mdecorde | // $LastChangedDate: 2017-06-26 16:53:47 +0200 (lun. 26 juin 2017) $
|
25 | 479 | mdecorde | // $LastChangedRevision: 3451 $
|
26 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 321 | mdecorde | //
|
28 | 986 | mdecorde | package org.txm.scripts.importer.alceste
|
29 | 321 | mdecorde | |
30 | 321 | mdecorde | |
31 | 986 | mdecorde | //import org.txm.scripts.filters.TabulatedToXml.*;
|
32 | 986 | mdecorde | import org.txm.scripts.importer.*; |
33 | 1000 | mdecorde | import org.txm.importer.scripts.filters.*; |
34 | 321 | mdecorde | import org.txm.scripts.*; |
35 | 1000 | mdecorde | import org.txm.importer.cwb.CwbEncode |
36 | 1000 | mdecorde | import org.txm.importer.cwb.CwbMakeAll |
37 | 321 | mdecorde | import org.txm.utils.io.FileCopy; |
38 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
39 | 321 | mdecorde | |
40 | 321 | mdecorde | import javax.xml.stream.*; |
41 | 321 | mdecorde | |
42 | 321 | mdecorde | import java.io.BufferedWriter; |
43 | 321 | mdecorde | import java.io.File; |
44 | 321 | mdecorde | import java.io.FileOutputStream; |
45 | 321 | mdecorde | import java.io.OutputStreamWriter; |
46 | 321 | mdecorde | import java.io.PrintStream; |
47 | 321 | mdecorde | import java.net.URL; |
48 | 321 | mdecorde | |
49 | 986 | mdecorde | import org.txm.scripts.filters.CutHeader.*; |
50 | 986 | mdecorde | import org.txm.scripts.filters.Tokeniser.*; |
51 | 986 | mdecorde | import org.txm.scripts.filters.FusionHeader.*; |
52 | 986 | mdecorde | import org.txm.scripts.filters.TagSentences.*; |
53 | 321 | mdecorde | |
54 | 321 | mdecorde | import org.txm.Toolbox; |
55 | 321 | mdecorde | |
/**
 * Entry point of the Alceste import module.
 *
 * The import runs four steps: the Alceste source file is split into XML
 * files with Alceste2Xml, each XML file is tokenized with
 * SimpleTokenizerXml, sentences are tagged with a CutHeader / TagSentences /
 * FusionHeader filter sequence, and the result is converted to XML-TEI-TXM
 * with Xml2Ana.
 */
class importer {

	/**
	 * Runs the whole import chain on the first usable file of a directory.
	 *
	 * A file is usable when it is a regular, readable, non-hidden file whose
	 * name is not "import.xml" and does not end with "~".
	 *
	 * @param rootDirFile the directory containing the Alceste source file
	 * @param binDir the working directory; its "split", "tokenized" and
	 *               "stokenized" sub-directories are deleted and recreated
	 * @param txmDir the directory receiving the XML-TEI-TXM files
	 * @param encoding the encoding handed to Alceste2Xml
	 * @param basename the corpus base name (not used by this method)
	 * @param lang the language handed to the tokenizer
	 * @return false if no usable source file is found, a working directory
	 *         cannot be created, the Alceste conversion fails or the
	 *         tokenizer throws an exception; true otherwise. A file that
	 *         merely fails one of the per-file steps is reported on the
	 *         console and does not change the result.
	 */
	public static boolean run(File rootDirFile, File binDir, File txmDir, String encoding, String basename, String lang)
	{
		// Take one snapshot of the directory: listFiles() returns null when
		// rootDirFile is not a readable directory.
		File[] candidates = rootDirFile.listFiles();
		if (candidates == null || candidates.length == 0) {
			println "Error: no file to process in "+rootDirFile;
			return false;
		}

		File srcfile = null;
		for (File f : candidates) {
			if (!f.isHidden() && f.canRead() && f.isFile() && !f.getName().equals("import.xml") && !f.getName().endsWith("~")) {
				srcfile = f;
				break;
			}
		}
		// The directory may only contain excluded entries (import.xml,
		// backup files, sub-directories): stop here instead of handing a
		// null file to the converter.
		if (srcfile == null) {
			println "Error: no file to process in "+rootDirFile;
			return false;
		}

		File splitDir = new File(binDir, "split");
		File tokenizeDir = new File(binDir, "tokenized");
		File stokenizeDir = new File(binDir, "stokenized");
		for (File dir : [stokenizeDir, tokenizeDir, splitDir]) {
			if (!resetDir(dir)) {
				println "Error: could not create directory "+dir;
				return false;
			}
		}

		// Build xml files from the alceste file
		Alceste2Xml transformer = new Alceste2Xml();
		if (!transformer.run(srcfile, splitDir, encoding)) {
			return false;
		}

		// Tokenize xml files
		List<File> srcfiles = listFilesOf(splitDir);
		println (srcfiles.size()+ " texts found in "+srcfile)

		println("Tokenizing files ("+srcfiles.size()+")")
		for (File f : srcfiles) {
			print "."
			File resultfile = new File(tokenizeDir, f.getName());
			try {
				def builder = new SimpleTokenizerXml(f, resultfile, lang);
				builder.setStartTag("text")
				if (!builder.process()) {
					println "Failed to tokenize: "+f;
					// Do not leave a partial result for the next step.
					resultfile.delete();
				}
			}
			catch (Exception e) {
				org.txm.utils.logger.Log.printStackTrace(e);
				println "Failed to tokenize: "+f;
				return false;
			}
		}
		println ""

		// Tag sentences of the tokenized files
		List<File> stokenfiles = listFilesOf(tokenizeDir);
		println("Tagging sentences of "+stokenfiles.size()+" files")
		for (File f : stokenfiles) {
			print "."
			Sequence S = new Sequence();
			Filter F1 = new CutHeader();
			Filter F7 = new TagSentences();
			Filter F11 = new FusionHeader();
			S.add(F1);
			S.add(F7);
			S.add(F11);
			File xmlfile = new File(stokenizeDir, f.getName());
			// Temporary header file, created next to the tokenized file
			// and shared by the CutHeader and FusionHeader filters.
			File headerfile = new File(f.getParentFile(), f.getName()+"header.xml");

			S.SetInFileAndOutFile(f.getPath(), xmlfile.getPath());
			S.setEncodages("UTF-8","UTF-8");
			Object[] arguments1 = [headerfile.getAbsolutePath()];
			F1.SetUsedParam(arguments1);
			Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
			F11.SetUsedParam(arguments2);
			if (!S.proceed()) {
				println "Failed to tag file: "+f
			}
			S.clean();
			headerfile.delete();//remove the prepared file to clean
		}
		println ""

		//TRANSFORM INTO XML-TEI-TXM
		List<File> tokenfiles = listFilesOf(stokenizeDir);
		println("Building xml-tei-txm ("+tokenfiles.size()+" files)")
		for (File f : tokenfiles) {
			print "."
			File txmfile = new File(txmDir, f.getName());

			def correspType = new HashMap<String, String>()
			def correspRef = new HashMap<String, String>()
			// the ids of all the respStmt must be listed
			def respId = [];
			// maps a respId to the execution report of the tool
			def applications = new HashMap<String,HashMap<String,String>>();
			// maps a respId to the type attributes of the ana property of the txm w,
			// to build the refs to the taxonomies
			def taxonomiesUtilisees = new HashMap<String,String[]>();
			// associates an item id with its description and its URI
			def itemsURI = new HashMap<String,HashMap<String,String>>();
			// respStmt information
			// resps (respId <see above>, [description, person, date])
			def resps = new HashMap<String,String[]>();
			// run the conversion
			def builder = new Xml2Ana(f);
			builder.setConvertAllAtrtibutes true;
			builder.setCorrespondances(correspRef, correspType);
			builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
			if (!builder.process(txmfile)) {
				println "Failed to build xml-txm of file: "+f;
				// Do not leave a partial XML-TXM file behind.
				txmfile.delete();
			}
		}
		println ""
		return true;
	}

	/**
	 * Deletes a directory with its content and recreates it empty.
	 *
	 * @param dir the directory to reset
	 * @return true if the directory exists once the call returns
	 */
	private static boolean resetDir(File dir) {
		dir.deleteDir();
		return dir.mkdirs() || dir.isDirectory();
	}

	/**
	 * Lists the entries of a directory without ever returning null.
	 *
	 * @param dir the directory to list
	 * @return the entries of the directory, or an empty list if it cannot be listed
	 */
	private static List<File> listFilesOf(File dir) {
		File[] files = dir.listFiles();
		return files == null ? [] : Arrays.asList(files);
	}
}