Révision 3306
TXM/trunk/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImporter.groovy (revision 3306) | ||
---|---|---|
6 | 6 |
import org.txm.importer.xtz.* |
7 | 7 |
import org.txm.metadatas.Metadatas |
8 | 8 |
import org.txm.scripts.importer.xtz.* |
9 |
import org.txm.tigersearch.preferences.TigerSearchPreferences |
|
9 | 10 |
import org.txm.utils.BundleUtils |
10 | 11 |
import org.txm.utils.FileUtils |
11 | 12 |
import org.txm.utils.io.FileCopy |
... | ... | |
42 | 43 |
Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1) |
43 | 44 |
} |
44 | 45 |
|
45 |
|
|
46 | 46 |
File sourceDirectory = inputDirectory |
47 | 47 |
File binaryDirectory = module.getBinaryDirectory() |
48 |
|
|
49 |
File master = new File(sourceDirectory, "main.xml") |
|
50 | 48 |
|
49 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
|
50 |
String headerFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.HEADER_FILENAME); |
|
51 |
|
|
52 |
File master = new File(sourceDirectory, driverFilename) |
|
53 |
|
|
51 | 54 |
def xmlFiles = [] // the TIGER XML files |
52 |
if (!master.exists() ) {
|
|
53 |
println "No main.xml ($master) file found, trying to find a proper TIGER XML file." |
|
55 |
if (true || !master.exists() ) { // Managing the main.xml file is a pain, we'll do it later
|
|
56 |
//println "No main.xml ($master) file found, trying to find a proper TIGER XML file."
|
|
54 | 57 |
// master = new File(sourceDirectory, "main.xml") |
55 | 58 |
// String subcorpora = ""; |
56 | 59 |
xmlFiles = sourceDirectory.listFiles(new FileFilter() { |
... | ... | |
60 | 63 |
String filename = file.getName() |
61 | 64 |
if (filename.equals("import.xml")) return false; |
62 | 65 |
if (!filename.endsWith(".xml")) return false; |
66 |
if (filename.equals(driverFilename)) return false; |
|
67 |
if (filename.equals(headerFilename)) return false; |
|
63 | 68 |
|
64 | 69 |
return true; |
65 | 70 |
} |
66 | 71 |
}); |
67 | 72 |
|
68 |
if (xmlFiles.size() > 1) { |
|
69 |
println "Error, the source directory contains more than one TIGER XML file ?" |
|
70 |
isSuccessFul = false; |
|
71 |
return; |
|
72 |
} |
|
73 |
if (xmlFiles.size() == 0) { |
|
74 |
println "Error no XML file found in $sourceDirectory directory" |
|
75 |
isSuccessFul = false; |
|
76 |
return; |
|
77 |
} |
|
78 |
println "Using ${xmlFiles.get(0)} as TIGER XML source file." |
|
73 |
println "Using ${xmlFiles} as TIGER XML source files." |
|
79 | 74 |
|
80 | 75 |
} else { // parse the master file |
76 |
println "Using the TIGER MAIN file: $master" |
|
77 |
|
|
78 |
|
|
81 | 79 |
for (def s : new XmlSlurper().parse(master).body.subcorpus) { |
82 | 80 |
String name = ""+s.@external |
83 | 81 |
if (name.startsWith("file:")) { |
... | ... | |
101 | 99 |
File tsXSLFile = new File(Toolbox.getTxmHomePath(), "xsl/ts.xsl"); |
102 | 100 |
BundleUtils.copyFiles("org.txm.tigersearch.rcp", "groovy", "org/txm/scripts/importer/tigersearch", "ts.xsl", tsXSLFile.getParentFile()); |
103 | 101 |
|
104 |
File xmltxmSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation |
|
105 |
xmltxmSrcDir.mkdirs(); |
|
106 |
if (master.exists()) println "TIGER MAIN file: $master" |
|
107 |
println "TIGER XML files: $xmlFiles" |
|
102 |
File tigerXmlSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation |
|
103 |
tigerXmlSrcDir.deleteDir() |
|
104 |
tigerXmlSrcDir.mkdirs(); |
|
105 |
|
|
106 |
println "TIGER-XML files: $xmlFiles" |
|
108 | 107 |
for (File xmlTigerFile : xmlFiles) { |
109 |
FileCopy.copy(xmlTigerFile, new File(xmltxmSrcDir, xmlTigerFile.getName()));
|
|
108 |
FileCopy.copy(xmlTigerFile, new File(tigerXmlSrcDir, xmlTigerFile.getName()));
|
|
110 | 109 |
} |
111 | 110 |
|
112 | 111 |
File tokenizedDir = new File(module.getBinaryDirectory(),"tokenized"); |
113 | 112 |
|
114 |
if (!ApplyXsl2.processImportSources(tsXSLFile, xmltxmSrcDir, tokenizedDir)) {
|
|
115 |
println "Error while applying TS XSL file to $xmltxmSrcDir"
|
|
113 |
if (!ApplyXsl2.processImportSources(tsXSLFile, tigerXmlSrcDir, tokenizedDir)) {
|
|
114 |
println "Error while applying TS XSL file to $tigerXmlSrcDir"
|
|
116 | 115 |
isSuccessFul = false; |
117 | 116 |
return; |
118 | 117 |
} |
119 | 118 |
|
120 | 119 |
File[] files = tokenizedDir.listFiles(IOUtils.HIDDENFILE_FILTER); |
121 | 120 |
if (files == null || files.length == 0) { |
122 |
println "Error while applying TS XSL file to $xmltxmSrcDir is empty"
|
|
121 |
println "Error while applying TS XSL file to $tigerXmlSrcDir is empty"
|
|
123 | 122 |
isSuccessFul = false; |
124 | 123 |
return; |
125 | 124 |
} |
126 | 125 |
|
127 |
if (!doToXMLTXMStep()) return; |
|
128 |
if (!doInjectMetadataStep()) return; |
|
126 |
if (!doToXMLTXMStep()) return; // build the XML-TXM files |
|
127 |
|
|
128 |
if (!doInjectMetadataStep()) return; // inject the metadata in the XML-TXM files |
|
129 | 129 |
|
130 |
module.orderedFiles = new ArrayList<String>(); |
|
131 |
for (def f : xmlFiles) { |
|
132 |
String id = FileUtils.stripExtension(f) |
|
133 |
module.orderedFiles.add(id) |
|
134 |
} |
|
135 | 130 |
isSuccessFul = outputDirectory.listFiles(IOUtils.HIDDENFILE_FILTER).size() > 0 |
136 | 131 |
|
137 | 132 |
String cleanDirectories = project.getCleanAfterBuild(); |
TXM/trunk/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImport.groovy (revision 3306) | ||
---|---|---|
17 | 17 |
import org.txm.importer.xtz.* |
18 | 18 |
import org.txm.scripts.importer.xtz.* |
19 | 19 |
import org.txm.searchengine.ts.TIGERSearchEngine |
20 |
import org.txm.tigersearch.preferences.TigerSearchPreferences |
|
20 | 21 |
|
21 | 22 |
class TSImport extends XTZImport { |
22 | 23 |
|
... | ... | |
43 | 44 |
super.start(); // call the usual XTZ import |
44 | 45 |
|
45 | 46 |
if (isSuccessful) { |
46 |
TIGERSearchEngine.buildTIGERCorpus(this.sourceDirectory, this.binaryDirectory, corpusName); |
|
47 |
|
|
48 |
File tigerXmlSrcDir = new File(binaryDirectory, "src") |
|
49 |
|
|
50 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
|
51 |
|
|
52 |
File master = new File(tigerXmlSrcDir, driverFilename) |
|
53 |
|
|
54 |
File srcmaster = new File(sourceDirectory, driverFilename) |
|
55 |
if (srcmaster.exists()) { |
|
56 |
println "Using source TIGER driver file: $srcmaster -> $master" |
|
57 |
FileCopy.copy(srcmaster, master) |
|
58 |
} else { |
|
59 |
println "Building TIGER driver file: $master..." |
|
60 |
String headerFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.HEADER_FILENAME); |
|
61 |
File header = new File(sourceDirectory, headerFilename) |
|
62 |
if (!header.exists()) { |
|
63 |
// parse TIGER-XML files to build a default header file |
|
64 |
} |
|
65 |
|
|
66 |
//build the master file using the TIGER-XML files of sourceDirectory |
|
67 |
// if necessary, convert a driver file (with <corpus>) into a subcorpus file (with <subcorpus>) |
|
68 |
def xmlFiles = tigerXmlSrcDir.listFiles() |
|
69 |
|
|
70 |
HashMap<String, HashSet<String>> tfeatures = new HashMap<String, HashSet<String>>() |
|
71 |
HashMap<String, HashSet<String>> ntfeatures = new HashMap<String, HashSet<String>>() |
|
72 |
HashSet<String> edges = new HashSet<String>() |
|
73 |
HashSet<String> secedges = new HashSet<String>() |
|
74 |
for (def xmlFile : xmlFiles) { |
|
75 |
//println xmlFile |
|
76 |
def doc = new XmlSlurper().parse(xmlFile) |
|
77 |
def terminals = null |
|
78 |
|
|
79 |
if (doc.name() == "corpus") terminals = doc.body.s.graph.terminals |
|
80 |
else terminals = doc.s.graph.terminals |
|
81 |
|
|
82 |
for (def terminal : doc.s.graph.terminals) { |
|
83 |
|
|
84 |
for (def t : terminal.t) { |
|
85 |
def attributes = t.attributes() |
|
86 |
for (def a : attributes.keySet()) { |
|
87 |
if (a == "id") continue; |
|
88 |
|
|
89 |
if (!tfeatures.containsKey(a)) { |
|
90 |
tfeatures[a] = new HashSet<String>(); |
|
91 |
} |
|
92 |
|
|
93 |
tfeatures[a].add(""+attributes[a]) |
|
94 |
|
|
95 |
} |
|
96 |
} |
|
97 |
} |
|
98 |
|
|
99 |
def nonterminals = null |
|
100 |
if (doc.name() == "corpus") nonterminals = doc.body.s.graph.nonterminals |
|
101 |
else nonterminals = doc.s.graph.nonterminals |
|
102 |
|
|
103 |
for (def nterminal : nonterminals) { |
|
104 |
|
|
105 |
for (def nt : nterminal.nt) { |
|
106 |
def attributes = nt.attributes() |
|
107 |
for (def a : attributes.keySet()) { |
|
108 |
if (a == "id") continue; |
|
109 |
|
|
110 |
if (!ntfeatures.containsKey(a)) { |
|
111 |
ntfeatures[a] = new HashSet<String>(); |
|
112 |
} |
|
113 |
|
|
114 |
ntfeatures[a].add(""+attributes[a]) |
|
115 |
|
|
116 |
} |
|
117 |
for (def edge : nt.edge) { |
|
118 |
edges.add(""+edge.@label) |
|
119 |
} |
|
120 |
for (def secedge : nt.secedge) { |
|
121 |
secedges.add(""+secedge.@label) |
|
122 |
} |
|
123 |
} |
|
124 |
} |
|
125 |
|
|
126 |
if (doc.name() == "corpus") { // convert file to a subcorpus file |
|
127 |
doc.name = "subcorpus" |
|
128 |
doc.head = {} |
|
129 |
} |
|
130 |
} |
|
131 |
|
|
132 |
def masterwriter = IOUtils.getWriter(master, "UTF-8") |
|
133 |
masterwriter.println("""<?xml version="1.0" encoding="utf-8"?> |
|
134 |
<corpus id="$corpusName"> |
|
135 |
<head> |
|
136 |
<meta> |
|
137 |
<name>$corpusName</name> |
|
138 |
<author>TXM</author> |
|
139 |
<date></date> |
|
140 |
<description>default master file generated by TXM for the TIGER import module</description> |
|
141 |
<format>TIGER-XML</format> |
|
142 |
<history></history> |
|
143 |
</meta> |
|
144 |
<annotation>""") |
|
145 |
for (def f : tfeatures.keySet()) { |
|
146 |
masterwriter.println(""" <feature domain="T" name="$f">""") |
|
147 |
for (def v : tfeatures[f]) { |
|
148 |
v = v.replace("<", "&lt;").replace(">", "&gt;") |
|
149 |
masterwriter.println("""<value name="$v"/>""") |
|
150 |
} |
|
151 |
masterwriter.println(""" </feature>""") |
|
152 |
} |
|
153 |
masterwriter.flush() |
|
154 |
for (def f : ntfeatures.keySet()) { |
|
155 |
masterwriter.println(""" <feature domain="NT" name="$f">""") |
|
156 |
for (def v : ntfeatures[f]) { |
|
157 |
v = v.replace("<", "&lt;").replace(">", "&gt;") |
|
158 |
masterwriter.println(""" <value name="$v"/>""") |
|
159 |
} |
|
160 |
masterwriter.println(""" </feature>""") |
|
161 |
} |
|
162 |
masterwriter.flush() |
|
163 |
|
|
164 |
masterwriter.println(""" <edgelabel>""") |
|
165 |
for (def v : edges) { |
|
166 |
v = v.replace("<", "&lt;").replace(">", "&gt;") |
|
167 |
masterwriter.println(""" <value name="$v"/>""") |
|
168 |
} |
|
169 |
masterwriter.println(""" </edgelabel>""") |
|
170 |
masterwriter.flush() |
|
171 |
masterwriter.println(""" <secedgelabel>""") |
|
172 |
for (def v : secedges) { |
|
173 |
v = v.replace("<", "&lt;").replace(">", "&gt;") |
|
174 |
masterwriter.println(""" <value name="$v"/>""") |
|
175 |
} |
|
176 |
masterwriter.println(""" </secedgelabel>""") |
|
177 |
masterwriter.flush() |
|
178 |
masterwriter.println(""" |
|
179 |
</annotation> |
|
180 |
</head> |
|
181 |
<body>""") |
|
182 |
for (def xmlFileName : getTXMFilesOrder()) { |
|
183 |
masterwriter.println(""" <subcorpus external="file:${xmlFileName}" name="${xmlFileName}"/>/>""") |
|
184 |
} |
|
185 |
masterwriter.flush() |
|
186 |
masterwriter.println(""" </body> |
|
187 |
</corpus> |
|
188 |
""") |
|
189 |
masterwriter.flush() |
|
190 |
masterwriter.close() |
|
191 |
} |
|
192 |
TIGERSearchEngine.buildTIGERCorpus(tigerXmlSrcDir, this.binaryDirectory, corpusName); |
|
47 | 193 |
} |
48 | 194 |
} |
49 | 195 |
|
50 |
ArrayList<String> orderedFiles = null; |
|
51 |
protected ArrayList<String> getTXMFilesOrder() { |
|
52 |
orderedFiles; |
|
53 |
} |
|
196 |
// ArrayList<String> orderedFiles = null; |
|
197 |
protected ArrayList<String> getTXMFilesOrder() { |
|
198 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
|
199 |
File driverFile = new File(binaryDirectory, "src/"+driverFilename) |
|
200 |
if (driverFile.exists()) { |
|
201 |
ArrayList<String> orderedFiles = new ArrayList<String>(); |
|
202 |
for (def s : new XmlSlurper().parse(driverFile).body.subcorpus) { |
|
203 |
String name = ""+s.@external |
|
204 |
if (name.startsWith("file:")) { |
|
205 |
orderedFiles << name.substring(5); |
|
206 |
} |
|
207 |
} |
|
208 |
return orderedFiles |
|
209 |
} else { |
|
210 |
return super.getTXMFilesOrder(); |
|
211 |
} |
|
212 |
} |
|
54 | 213 |
} |
TXM/trunk/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 3306) | ||
---|---|---|
336 | 336 |
"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n"); |
337 | 337 |
|
338 | 338 |
BasicConfigurator.configure(); |
339 |
File master = new File(sourceDirectory, "main.xml"); |
|
339 |
|
|
340 |
String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME); |
|
341 |
|
|
342 |
File master = new File(sourceDirectory, driverFilename); |
|
340 | 343 |
if (!master.exists()) { |
341 | 344 |
File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() { |
342 | 345 |
public boolean accept(File file) { |
Formats disponibles : Unified diff