Révision 3306

TXM/trunk/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImporter.groovy (revision 3306)
6 6
import org.txm.importer.xtz.*
7 7
import org.txm.metadatas.Metadatas
8 8
import org.txm.scripts.importer.xtz.*
9
import org.txm.tigersearch.preferences.TigerSearchPreferences
9 10
import org.txm.utils.BundleUtils
10 11
import org.txm.utils.FileUtils
11 12
import org.txm.utils.io.FileCopy
......
42 43
			Toolbox.getPreference(TBXPreferences.METADATA_TXTSEPARATOR), 1)
43 44
		}
44 45

  
45

  
46 46
		File sourceDirectory = inputDirectory
47 47
		File binaryDirectory = module.getBinaryDirectory()
48

  
49
		File master = new File(sourceDirectory, "main.xml")
50 48
		
49
		String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
50
		String headerFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.HEADER_FILENAME);
51
		
52
		File master = new File(sourceDirectory, driverFilename)
53
		
51 54
		def xmlFiles = [] // the TIGER XML files
52
		if (!master.exists() ) {
53
			println "No main.xml ($master) file found, trying to find a proper TIGER XML file."
55
		if (true || !master.exists() ) { // Managing the main.xml file is a pain, we'll do it later
56
			//println "No main.xml ($master) file found, trying to find a proper TIGER XML file."
54 57
//			master = new File(sourceDirectory, "main.xml")
55 58
//			String subcorpora = "";
56 59
			xmlFiles = sourceDirectory.listFiles(new FileFilter() {
......
60 63
							String filename = file.getName()
61 64
							if (filename.equals("import.xml")) return false;
62 65
							if (!filename.endsWith(".xml")) return false;
66
							if (filename.equals(driverFilename)) return false;
67
							if (filename.equals(headerFilename)) return false;
63 68
							
64 69
							return true;
65 70
						}
66 71
					});
67 72
			
68
			if (xmlFiles.size() > 1) {
69
				println "Error, the source directory contains more than one TIGER XML file ?"
70
				isSuccessFul = false;
71
				return;
72
			}
73
			if (xmlFiles.size() == 0) {
74
				println "Error no XML file found in $sourceDirectory directory"
75
				isSuccessFul = false;
76
				return;
77
			}
78
			println "Using ${xmlFiles.get(0)} as TIGER XML source file."
73
			println "Using ${xmlFiles} as TIGER XML source files."
79 74
			
80 75
		} else { // parse the master file
76
			println "Using the TIGER MAIN file: $master"
77
			
78
			
81 79
			for (def s : new XmlSlurper().parse(master).body.subcorpus) {
82 80
				String name = ""+s.@external
83 81
				if (name.startsWith("file:")) {
......
101 99
		File tsXSLFile = new File(Toolbox.getTxmHomePath(), "xsl/ts.xsl");
102 100
		BundleUtils.copyFiles("org.txm.tigersearch.rcp", "groovy", "org/txm/scripts/importer/tigersearch", "ts.xsl", tsXSLFile.getParentFile());
103 101

  
104
		File xmltxmSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation
105
		xmltxmSrcDir.mkdirs();
106
		if (master.exists()) println "TIGER MAIN file: $master" 
107
		println "TIGER XML files: $xmlFiles"
102
		File tigerXmlSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation
103
		tigerXmlSrcDir.deleteDir()
104
		tigerXmlSrcDir.mkdirs();
105
		
106
		println "TIGER-XML files: $xmlFiles"
108 107
		for (File xmlTigerFile : xmlFiles) {
109
			FileCopy.copy(xmlTigerFile, new File(xmltxmSrcDir, xmlTigerFile.getName()));
108
			FileCopy.copy(xmlTigerFile, new File(tigerXmlSrcDir, xmlTigerFile.getName()));
110 109
		}
111 110

  
112 111
		File tokenizedDir = new File(module.getBinaryDirectory(),"tokenized");
113 112
		
114
		if (!ApplyXsl2.processImportSources(tsXSLFile, xmltxmSrcDir, tokenizedDir)) {
115
			println "Error while applying TS XSL file to $xmltxmSrcDir"
113
		if (!ApplyXsl2.processImportSources(tsXSLFile, tigerXmlSrcDir, tokenizedDir)) {
114
			println "Error while applying TS XSL file to $tigerXmlSrcDir"
116 115
			isSuccessFul = false;
117 116
			return;
118 117
		}
119 118

  
120 119
		File[] files = tokenizedDir.listFiles(IOUtils.HIDDENFILE_FILTER);
121 120
		if (files == null || files.length == 0) {
122
			println "Error while applying TS XSL file to $xmltxmSrcDir is empty"
121
			println "Error while applying TS XSL file to $tigerXmlSrcDir is empty"
123 122
			isSuccessFul = false;
124 123
			return;
125 124
		}
126 125

  
127
		if (!doToXMLTXMStep()) return;
128
		if (!doInjectMetadataStep()) return;
126
		if (!doToXMLTXMStep()) return; // build the XML-TXM files
127
		
128
		if (!doInjectMetadataStep()) return; // inject the metadata in the XML-TXM files
129 129

  
130
		module.orderedFiles = new ArrayList<String>();
131
		for (def f : xmlFiles) {
132
			String id = FileUtils.stripExtension(f)
133
			module.orderedFiles.add(id)
134
		}
135 130
		isSuccessFul = outputDirectory.listFiles(IOUtils.HIDDENFILE_FILTER).size() > 0
136 131
		
137 132
		String cleanDirectories = project.getCleanAfterBuild();
TXM/trunk/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImport.groovy (revision 3306)
17 17
import org.txm.importer.xtz.*
18 18
import org.txm.scripts.importer.xtz.*
19 19
import org.txm.searchengine.ts.TIGERSearchEngine
20
import org.txm.tigersearch.preferences.TigerSearchPreferences
20 21

  
21 22
class TSImport extends XTZImport {
22 23
	
......
43 44
		super.start(); // call the usual XTZ import
44 45
		
45 46
		if (isSuccessful) {
46
			TIGERSearchEngine.buildTIGERCorpus(this.sourceDirectory, this.binaryDirectory, corpusName);
47
			
48
			File tigerXmlSrcDir = new File(binaryDirectory, "src")
49
			
50
			String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
51
			
52
			File master = new File(tigerXmlSrcDir, driverFilename)
53
			
54
			File srcmaster = new File(sourceDirectory, driverFilename)
55
			if (srcmaster.exists()) {
56
				println "Using source TIGER driver file: $srcmaster -> $master"
57
				FileCopy.copy(srcmaster, master)
58
			} else {
59
				println "Building TIGER driver file: $master..."
60
				String headerFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.HEADER_FILENAME);
61
				File header = new File(sourceDirectory, headerFilename)
62
				if (!header.exists()) {
63
					// parse TIGER-XML files to build a default header file
64
				}
65
				
66
				//build the master file using the TIGER-XML files of sourceDirectory
67
				// if necessary, convert a driver file (with <corpus>) into a subcorpus file (with <subcorpus>)
68
				def xmlFiles = tigerXmlSrcDir.listFiles()
69
				
70
				HashMap<String, HashSet<String>> tfeatures = new HashMap<String, HashSet<String>>()
71
				HashMap<String, HashSet<String>> ntfeatures = new HashMap<String, HashSet<String>>()
72
				HashSet<String> edges = new HashSet<String>()
73
				HashSet<String> secedges = new HashSet<String>()
74
				for (def xmlFile : xmlFiles) {
75
					//println xmlFile
76
					def doc = new XmlSlurper().parse(xmlFile)
77
					def terminals = null
78
					
79
					if (doc.name() == "corpus") terminals = doc.body.s.graph.terminals
80
					else terminals = doc.s.graph.terminals
81
					
82
					for (def terminal : doc.s.graph.terminals) {
83
						
84
						for (def t : terminal.t) {
85
							def attributes = t.attributes()
86
							for (def a : attributes.keySet()) {
87
								if (a == "id") continue;
88
								
89
								if (!tfeatures.containsKey(a)) {
90
									tfeatures[a] = new HashSet<String>();
91
								}
92
								
93
								tfeatures[a].add(""+attributes[a])
94
								
95
							}
96
						}
97
					}
98
					
99
					def nonterminals = null
100
					if (doc.name() == "corpus") nonterminals = doc.body.s.graph.nonterminals
101
					else nonterminals = doc.s.graph.nonterminals
102
					
103
					for (def nterminal : nonterminals) {
104
						
105
						for (def nt : nterminal.nt) {
106
							def attributes = nt.attributes()
107
							for (def a : attributes.keySet()) {
108
								if (a == "id") continue;
109
								
110
								if (!ntfeatures.containsKey(a)) {
111
									ntfeatures[a] = new HashSet<String>();
112
								}
113
								
114
								ntfeatures[a].add(""+attributes[a])
115
								
116
							}
117
							for (def edge : nt.edge) {
118
								edges.add(""+edge.@label)
119
							}
120
							for (def secedge : nt.secedge) {
121
								secedges.add(""+secedge.@label)
122
							}
123
						}
124
					}
125
					
126
					if (doc.name() == "corpus") { // convert file to a subcorpus file
127
						doc.name = "subcorpus"
128
						doc.head = {}
129
					}
130
				}
131
				
132
				def masterwriter = IOUtils.getWriter(master, "UTF-8")
133
				masterwriter.println("""<?xml version="1.0" encoding="utf-8"?>
134
<corpus id="$corpusName">
135
 <head>
136
  <meta>
137
   <name>$corpusName</name>
138
   <author>TXM</author>
139
   <date></date>
140
   <description>default master file generated by TXM for the TIGER import module</description>
141
   <format>TIGER-XML</format>
142
   <history></history>
143
  </meta>
144
  <annotation>""")
145
				for (def f : tfeatures.keySet()) {
146
					masterwriter.println("""   <feature domain="T" name="$f">""")
147
					for (def v : tfeatures[f]) {
148
						v = v.replace("<", "&lt;").replace(">", "&gt;")
149
						masterwriter.println("""<value name="$v"/>""")
150
					}
151
					masterwriter.println("""   </feature>""")
152
				}
153
				masterwriter.flush()
154
				for (def f : ntfeatures.keySet()) {
155
					masterwriter.println("""   <feature domain="NT" name="$f">""")
156
					for (def v : ntfeatures[f]) {
157
						v = v.replace("<", "&lt;").replace(">", "&gt;")
158
						masterwriter.println("""    <value name="$v"/>""")
159
					}
160
					masterwriter.println("""   </feature>""")
161
				}
162
				masterwriter.flush()
163
				
164
				masterwriter.println("""   <edgelabel>""")
165
				for (def v : edges) {
166
					v = v.replace("<", "&lt;").replace(">", "&gt;")
167
					masterwriter.println("""    <value name="$v"/>""")
168
				}
169
				masterwriter.println("""   </edgelabel>""")
170
				masterwriter.flush()
171
				masterwriter.println("""   <secedgelabel>""")
172
				for (def v : secedges) {
173
					v = v.replace("<", "&lt;").replace(">", "&gt;")
174
					masterwriter.println("""    <value name="$v"/>""")
175
				}
176
				masterwriter.println("""   </secedgelabel>""")
177
				masterwriter.flush()
178
				masterwriter.println("""
179
  </annotation>
180
 </head>
181
 <body>""")
182
				for (def xmlFileName : getTXMFilesOrder()) {
183
					masterwriter.println("""  <subcorpus external="file:${xmlFileName}" name="${xmlFileName}"/>/>""")
184
				}
185
				masterwriter.flush()
186
				masterwriter.println(""" </body>
187
</corpus>
188
""")
189
masterwriter.flush()
190
masterwriter.close()
191
			}
192
			TIGERSearchEngine.buildTIGERCorpus(tigerXmlSrcDir, this.binaryDirectory, corpusName);
47 193
		}
48 194
	}
49 195
	
50
	ArrayList<String> orderedFiles = null;
51
	protected ArrayList<String> getTXMFilesOrder() {
52
		orderedFiles;
53
	}
196
	//	ArrayList<String> orderedFiles = null;
197
		protected ArrayList<String> getTXMFilesOrder() {
198
			String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
199
			File driverFile = new File(binaryDirectory, "src/"+driverFilename)
200
			if (driverFile.exists()) {
201
				ArrayList<String> orderedFiles = new ArrayList<String>();
202
				for (def s : new XmlSlurper().parse(driverFile).body.subcorpus) {
203
					String name = ""+s.@external
204
					if (name.startsWith("file:")) {
205
						orderedFiles << name.substring(5);
206
					}
207
				}
208
				return orderedFiles
209
			} else {
210
				return super.getTXMFilesOrder();
211
			}
212
		}
54 213
}
TXM/trunk/org.txm.tigersearch.rcp/src/org/txm/searchengine/ts/TIGERSearchEngine.java (revision 3306)
336 336
				"log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n");
337 337
		
338 338
		BasicConfigurator.configure();
339
		File master = new File(sourceDirectory, "main.xml");
339
		
340
		String driverFilename = TigerSearchPreferences.getInstance().getString(TigerSearchPreferences.DRIVER_FILENAME);
341
		
342
		File master = new File(sourceDirectory, driverFilename);
340 343
		if (!master.exists()) {
341 344
			File[] xmlFiles = sourceDirectory.listFiles(new FileFilter() {
342 345
						public boolean accept(File file) {

Formats disponibles : Unified diff