Révision 1000

tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/TigerSubcorpus2Main.groovy (revision 1000)
1
/*
2
Combines subcorpus files to give a single main file.
3
 */
4
// TXM package statement
5
package org.txm.scripts.importer.srcmf
6

  
7
// Command-line entry point
8

  
9
// Parse the command line, validate it, then delegate to script().
def cli = new CliBuilder(
    usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
options = cli.parse(args)
// cli.parse() returns null (after reporting the problem itself) when the
// options cannot be parsed; bail out instead of dereferencing null.
if (!options) {
    System.exit(2)
}
// Honour the declared -h/--help option.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 1) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints the message itself and returns nothing, so it must not
    // be wrapped in println (that would print a stray "null").
    cli.usage()
    System.exit(2)
}
def tigerMaster = new File(options.arguments()[0])
// options.o is false when -o was not given; fall back to the default name.
def outputFile = new File(options.o ?: 'outputfile.xml')
script(tigerMaster, outputFile)
26

  
27
/**
 * Combines the subcorpus files referenced by a master file into one document.
 *
 * Each subcorpus element found under a body element of the master document is
 * removed and the children of the referenced file's root are appended to that
 * body instead. The result is written to outputFile as UTF-8, preceded by an
 * XML declaration.
 *
 * @param tigerMaster master file; referenced files are resolved against its parent directory
 * @param outputFile  file that receives the merged document
 */
def script (File tigerMaster, File outputFile) {
    def mergedDoc = new XmlParser().parse(tigerMaster)
    File baseDir = tigerMaster.getParentFile()
    mergedDoc.body.each { bodyNode ->
        bodyNode.subcorpus.each { ref ->
            // Skip the first five characters of the 'external' attribute
            // (presumably a "file:" prefix -- TODO confirm).
            String relativePath = ("" + ref.@external).substring(5)
            def externalDoc = new XmlParser().parse(new File(baseDir, relativePath))
            bodyNode.remove(ref)
            externalDoc.children().each { bodyNode.append(it) }
        }
    }
    // Pretty-print into memory first, then write everything in one go.
    def buffer = new StringWriter()
    def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
    printer.print(mergedDoc)
    String xmlText = buffer.toString()
    outputFile.withWriter("UTF-8") { out ->
        out.println('<?xml version="1.0" encoding="UTF-8"?>')
        out.print(xmlText)
    }
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/package.html (revision 1000)
1
<html>
2
<body>
3
<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources.</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/MasterReheader.groovy (revision 1000)
1
#! /usr/bin/groovy
2
package org.txm.scripts.importer.srcmf;
3

  
4
/*
5
 * To change this template, choose Tools | Templates
6
 * and open the template in the editor.
7
 */
8

  
9
// Set up globals
10
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
11
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
12
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
13
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']]
14
// def firstFeat = [t:'word', nt:'cat']
15

  
16
// Command-line entry point
17
// Parse the command line and delegate to script().
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which  to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)
// cli.parse() returns null (after reporting the problem itself) on malformed
// options, so every access below is guarded by a null check.
if (options && options.h) {
    // Honour the declared -h/--help option.
    cli.usage()
} else if (options && options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    def outputFile = null
    if (options.o) {
        outputFile = new File(options.o)
    } else {
        outputFile = new File(masterFolder, 'MasterReheader_out.xml')
    }
    // Defaults for the first listed feature of each domain.
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    script(
        masterFile, headerFile, outputFile,
        // options.nts / options.ts evaluate to false when the option is
        // absent; script() calls contains() on these values, so pass an
        // empty list instead.
        ['nt':(options.nts ?: []), 't':(options.ts ?: [])],
        // Pass the resolved defaults rather than the raw (possibly false)
        // option values.
        ['nt':ntfirst, 't':tfirst]
    )
} else if (options) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; wrapping it in println would print "null".
    cli.usage()
}
54

  
55
/**
 * Rebuilds the feature declarations of a master file from the attributes
 * actually present in its subcorpus files, and saves the result.
 *
 * All t and nt nodes of every referenced subcorpus file are scanned to collect
 * attribute names and their observed values (the 'id' attribute is ignored).
 * The existing feature nodes under head/annotation are removed and replaced by
 * one feature node per attribute, T features first, then NT. For attributes
 * listed in feats, one value node is added per observed value; its text is
 * taken from the matching feature/value in the header file, or '[unknown]'.
 *
 * @param masterFile master file to re-header
 * @param headerFile header file providing the value descriptions
 * @param outputFile file that receives the updated master document (UTF-8)
 * @param feats      map with keys 't' and 'nt' giving the feature names that get value nodes
 * @param firstFeat  map with keys 't' and 'nt' giving the feature to list first;
 *                   replaced in place by the alphabetically first attribute when
 *                   the requested one does not occur
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // Attribute name -> set of observed values, for each node type
    def attrVal = [nt:[:], t:[:]]

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each {
        // The first five characters of '@external' are skipped
        // (presumably a "file:" prefix -- TODO confirm).
        def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        // Closure for t & nt nodes processing.
        def getvals = { node, type ->
            node.attributes().each { mEntry ->
                if (! attrVal[type].keySet().contains(mEntry.getKey())) {
                    attrVal[type][mEntry.getKey()] = new HashSet()
                }
                attrVal[type][mEntry.getKey()].add(mEntry.getValue())
            }
        }
        subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')
    // Remove old feature nodes in master file
    def oldFeatureNodes = master.head.annotation.feature
    while (oldFeatureNodes) {
        // Declared locally so the variable does not leak into the script binding.
        def node = oldFeatureNodes.pop()
        node.parent().remove(node)
    }
    assert (! master.head.annotation.feature)
    // Check firstFeat was relevant
    ['t', 'nt'].each { type ->
        if (! (attrVal[type].keySet().contains(firstFeat[type]))) {
            firstFeat[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(firstFeat['t'])
    assert attrVal['nt'].keySet().contains(firstFeat['nt'])
    // Ordered feature lists: the chosen first feature, then all the others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [firstFeat[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
    }
    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        featList[type].each { feat ->
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value node if the node value is given in 'feats'.
            // A missing/empty entry in 'feats' simply means "no value nodes".
            if (feats[type] && feats[type].contains(feat)) {
                attrVal[type][feat].each { value ->
                    assert header.'**'.feature
                    assert header.'**'.feature[0].'@name'
                    assert header.'**'.feature[0].'@domain'
                    assert ['NT', 'T'].contains(header.'**'.feature[0].'@domain')
                    def hFNode = header.'**'.feature.find {
                        it.'@name' == feat && (
                            it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                        )
                    }
                    // Look the header value up once instead of twice.
                    def hValue = hFNode ? hFNode.value.find { it.'@name' == value } : null
                    def vText = hValue ? hValue.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() emits a declaration announcing
    // UTF-8, so the bytes must be written as UTF-8 rather than in the
    // platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
138

  
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SrcmfImporter.groovy (revision 1000)
1
/*
2
 * Calls all import scripts
3
 */
4

  
5
/**
6
 *
7
 * @author tmr
8
 */
9

  
10
// Command line form of import statements:
11
// import MasterReheader
12
// import PunctInjectImport
13
// import SubcorpusDataInject
14
// import TigerSubcorpus2Main
15
// import java.util.logging.FileHandler
16
// import javax.xml.parsers.DocumentBuilderFactory
17

  
18
// TXM package statement
19
package org.txm.scripts.importer.srcmf
20

  
21
import java.util.logging.*;
22

  
23

  
24
// Command line entry point
25
// Parse the command line, then run the whole import chain.
def cli = new CliBuilder(
    usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
options = cli.parse(args)
// cli.parse() returns null (after reporting the problem itself) when the
// options cannot be parsed; bail out instead of dereferencing null.
if (!options) {
    System.exit(2)
}
// Honour the declared -h/--help option.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 3) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; wrapping it in println would print "null".
    cli.usage()
    System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The working directory is the grandparent directory of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
    tigerFile,
    txmFile,
    headerFile,
    txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The merged file is only an intermediate product.
tigerXmlAll.delete()
47

  
48
/**
 * Runs every import step except punctuation injection: PoS injection,
 * re-heading of the master file, and merging of master and subcorpus files.
 *
 * Working files are created in a "tiger" directory under binDir and a log
 * file "tiger.log" is written directly in binDir.
 *
 * @param tigerFile  TIGER master file
 * @param txmFile    XML-TXM file; only its directory is used here (the first
 *                   file listed in that directory is the one actually parsed)
 * @param headerFile header file passed to the reheader step
 * @param binDir     directory receiving the "tiger" folder and the log file
 * @return the merged file "TigerAll.xml"
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
    // Run pos injection script
    File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
    File tigerDir = new File(binDir, "tiger")
    tigerDir.mkdir()
    File masterpos = new File(tigerDir, "master_pos.xml")
    File xmltxm = txmSrcDir.listFiles()[0]
    File logFile = new File(binDir, "tiger.log")
    def logHandler = new FileHandler(logFile.getAbsolutePath())
    try {
        def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
        sdi.processMaster(tigerFile, masterpos)
    } finally {
        // The handler is registered on SubcorpusDataInject's static logger;
        // detach and close it so the log file (and its lock file) is released
        // and handlers do not pile up across successive imports.
        SubcorpusDataInject.logger.removeHandler(logHandler)
        logHandler.close()
    }
    // Run reheader script
    def reheader = new MasterReheader()
    File tmp = File.createTempFile("tmp", ".xml",tigerDir)
    def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
    def firstFeat = ['nt':'cat', 't':'word']
    reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
    if (!tmp.exists()) {
    	println "Error: reheader failed"
    }
    // Replace the master file by the re-headed version.
    masterpos.delete()
    if (!tmp.renameTo(masterpos)) {
        // renameTo() reports failure only through its return value.
        println "Error: could not rename ${tmp} to ${masterpos}"
    }
    // Run merge master & subcorpus script
    def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
    def mergescript = new TigerSubcorpus2Main()
    mergescript.script(masterpos, tigerXmlAll)
    return tigerXmlAll
}
78

  
79
/**
 * Runs the punctuation injection step and writes "TigerPnc.xml" next to the
 * merged TIGER file.
 *
 * @param tigerXmlAll merged TIGER-XML file
 * @param txmFile     XML-TXM file providing the word list
 */
def doPnc(File tigerXmlAll, File txmFile) {
    // Declared locally: an undeclared assignment would create a variable in
    // the script binding that outlives this call.
    def injector = new PunctInjectImport(tigerXmlAll, txmFile)
    injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
    injector.process()
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SubcorpusDataInject.groovy (revision 1000)
1
#! /usr/bin/groovy
2
package org.txm.scripts.importer.srcmf;
3
/*
4
 * To change this template, choose Tools | Templates
5
 * and open the template in the editor.
6
 */
7

  
8
import java.util.logging.Logger
9
import java.util.logging.FileHandler
10
import java.util.logging.SimpleFormatter
11
import java.util.logging.Level
12

  
13

  
14
class SubcorpusDataInject {
15
	// Globals
16
	def textForm = 'prose'
17
	def TEI = null
18
	def wNodes = []
19
	def wNodesInQ = []
20
	// Set up namespaces
21
	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
22
	static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
23
	// Set up a logger
24
	static logger = Logger.getLogger('subcorpusDataInjectLog')
25
	// END GLOBALS
26

  
27
	/**
	 * Parses the XML-TXM file and prepares the word lists used later on.
	 *
	 * The text form is taken from newTextForm when it is non-empty and then
	 * overridden by the category referenced from the TEI header, if any. For
	 * the forms 'vers' and 'mixte', line breaks are recorded on the
	 * neighbouring non-punctuation words as an 'lb' attribute ('start' or
	 * 'end').
	 *
	 * @param xmlTxmFile  XML-TXM file to parse
	 * @param fh          handler added to the class logger
	 * @param newTextForm default text form; ignored when empty or null
	 */
	SubcorpusDataInject (
	File xmlTxmFile,
	FileHandler fh,
	String newTextForm
	) {
		// An empty/null newTextForm keeps the current default.
		textForm = newTextForm ?: textForm
		// Route log output to the supplied handler.
		logger.addHandler(fh)
		logger.setLevel(Level.ALL)
		fh.setFormatter(new SimpleFormatter())
		// Parse the XML-TXM file
		logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
		TEI = new XmlParser().parse(xmlTxmFile)
		// The TEI header, when it names a category, wins over the default.
		logger.log(Level.INFO, "Reading TEI header to detect text form.")
		def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
		if (catRefs) {
			// Drop the first character of the target (presumably '#' -- TODO confirm).
			def catId = catRefs.'@target'[0][1..-1]
			def category = TEI.teiHeader.encodingDesc.'**'.category.find {
				it.'@id' == catId || it.attribute(nsXml.id) == catId
			}
			textForm = category.catDesc[0].text()
			logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
		} else {
			logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
		}
		// Turn <lb/> elements into an 'lb' attribute on the adjacent words.
		if (textForm == 'vers' || textForm == 'mixte') {
			logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
			// Work list walked in document order; containers are expanded in place.
			def pending = TEI.text.body
			def afterLb = false
			def prevWord = null
			while (pending) {
				def current = pending.remove(0)
				def localName = current.name().getLocalPart()
				if (localName == 'lb') {
					afterLb = true
					if (prevWord) {
						prevWord.'@lb' = 'end'
					}
				} else if (localName == 'w') {
					// Punctuation never carries the line-break marker.
					if (!punctTest(current)) {
						if (afterLb) {
							current.'@lb' = 'start'
						}
						prevWord = current
						afterLb = false
					}
				} else {
					// Any other element: visit its element children next.
					pending.addAll(0, current.children().findAll { it.getClass() == Node })
				}
			}
		}
		// Build node lists
		logger.log(Level.INFO, "Building node lists.")
		wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
		wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
	}
86

  
87
	/**
	 * Command-line entry point.
	 *
	 * Expects two positional arguments, the XML-TXM file and the TS file. With
	 * -m the TS file is processed as a master file, otherwise as a single
	 * subcorpus.
	 *
	 * @param args raw command-line arguments
	 */
	static main(def args) {
		// Defaults used when -o / -l are not given
		def outputFile = 'test-out.xml'
		def logName = 'subcorpusDataInjectLog.log'
		def cli = new CliBuilder(
				usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
				)
		cli.h(longOpt:'help', 'Prints this message')
		cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
		cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
		cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
		println args
		def options = cli.parse(args)
		// cli.parse() returns null (after reporting the problem itself) when
		// the options cannot be parsed.
		if (!options) {
			return
		}
		// Honour the declared -h/--help option.
		if (options.h) {
			cli.usage()
			return
		}
		if (options.arguments().size() != 2) {
			println "Wrong number of arguments."
			// usage() prints by itself; wrapping it in println would print "null".
			cli.usage()
			return
		}
		// Call XML TXM loader
		def txmFile = new File (options.arguments()[0])
		def handler = new FileHandler(options.logfile ?: logName)
		def sdi = new SubcorpusDataInject(txmFile, handler, options.v ? 'vers' : '')
		// Call process master or process subcorpus
		def tsFile = new File(options.arguments()[1]).getCanonicalFile()
		def target = new File (options.output ?: outputFile)
		if (options.m) {
			sdi.processMaster(tsFile, target)
		} else {
			sdi.processSubcorpus(tsFile, target)
		}
	}
132

  
133
	/**
	 * Runs processSubcorpus on every subcorpus listed in a master file and
	 * saves an updated copy of the master file.
	 *
	 * Each processed subcorpus is written next to outputFile with "_pos.xml"
	 * replacing the last four characters of its name, and the 'external'
	 * reference in the master document is rewritten accordingly.
	 *
	 * @param masterFile master file listing the subcorpus files
	 * @param outputFile file receiving the updated master document (UTF-8)
	 */
	def processMaster (File masterFile, File outputFile) {
		logger.log(Level.INFO, "Loading corpus master file")
		// Resolve both folders from absolute paths. A file given without a
		// directory has no parent, and new File('', child) does NOT mean the
		// current directory: it resolves against a system-dependent default
		// (the filesystem root on Unix).
		File masterFolder = masterFile.getAbsoluteFile().getParentFile()
		File outputFolder = outputFile.getAbsoluteFile().getParentFile()
		def master = new XmlParser().parse(masterFile)
		// Process each subcorpus file
		master.body.subcorpus.each {
			// '@external'[5..-1] skips the first five characters of the
			// reference (presumably a "file:" prefix -- TODO confirm);
			// [..-5] additionally drops the last four (the extension).
			processSubcorpus(
					new File (masterFolder, it.'@external'[5..-1]),
					new File (outputFolder, "${it.'@external'[5..-5]}_pos.xml".toString())
					)
			it.'@external' = "${it.'@external'[0..-5]}_pos.xml".toString()
		}
		logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

		// XmlUtil.serialize() emits a declaration announcing UTF-8, so the
		// bytes must be written as UTF-8 rather than in the platform charset.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(master)
		}
		logger.log(Level.INFO, "Run reheader on this file!")
	}
163

  
164
	/**
	 * Adds 'pos', 'form' and 'q' attributes to every terminal of a subcorpus
	 * file, recomputes the 'headpos' attribute of its non-terminals, and saves
	 * the result.
	 *
	 * Terminals are aligned with the word nodes of the XML-TXM file loaded by
	 * the constructor. Terminals whose word is '#' get '--' for all three
	 * attributes; terminals whose word is '*' (duplicatas) are resolved through
	 * the 'dupl' secondary edge of their parent non-terminal.
	 *
	 * @param subcorpusFile subcorpus file to process
	 * @param outputFile    file receiving the updated subcorpus (UTF-8)
	 */
	def processSubcorpus (File subcorpusFile, File outputFile) {
		// Parse Subcorpus file
		logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
		def TS = new XmlParser().parse(subcorpusFile)
		// This is used as a stack
		def tNodes = TS.s.graph.terminals.t
		// This isn't
		def tNodesFixed = TS.s.graph.terminals.t

		// Build duplicata correspondance list: [duplicata id, real id] pairs
		logger.log(Level.INFO, "Building duplicata correspondance list")
		def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
			def duplId = dupl.'@id'
			def realId = dupl.parent().parent().nonterminals.nt.edge.find {
				it.'@idref' == duplId
			}.parent().secedge.find {
				it.'@label' == 'dupl'
			}.'@idref'
			[duplId, realId]
		}
		logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

		// Add pos & form tags
		logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
		// Get first wNode corresponding to a tNode.
		// The index is compared explicitly against 0: relying on Groovy truth
		// would treat a match at index 0 as "not found" and a failed search
		// (-1) as "found".
		def i = 0
		def wIx = -1
		while (i < tNodes.size()) {
			// Avoid trying to find "fake" first words, it's very inefficient
			if (tNodes[i].'@word' != '#') {
				wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
			}
			if (wIx >= 0) {
				break
			} else {
				i++
			}
		}
		assert wIx >= 0, "$i"
		// Writes the placeholder used when no word node can be associated.
		def markUnknown = { tNode ->
			tNode.'@pos' = '--'
			tNode.'@form' = '--'
			tNode.'@q' = '--'
		}
		/*
		 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
		 */
		// Iterate over tNodes.
		while (tNodes) {
			def thisTNode = tNodes.remove(0)
			def thisTNodeId = thisTNode.'@editionId'
			def wNode = null
			def q = false
			// Define the PoS / q / form injecting closure. It reads wNode and
			// q at call time, so both must be set before it is called.
			def injectPos = {
				// Q tag
				if (q) {
					thisTNode.'@q' = 'y'
				} else {
					thisTNode.'@q' = 'n'
				}
				// Form tag
				if (textForm == 'prose') {
					thisTNode.'@form' = 'prose'
				} else if (
				textForm == 'vers' ||
				(
				textForm == 'mixte'
				&& wNode.parent().parent().name().getLocalPart() == 'ab'
				)
				) {
					if (! wNode.'@lb') {
						thisTNode.'@form' = 'vers'
					} else if (wNode.'@lb' == 'start') {
						thisTNode.'@form' = 'vers_debut'
					} else if (wNode.'@lb' == 'end')  {
						thisTNode.'@form' = 'vers_fin'
					}
				}
				// DEBUG TMR 22/08/2012
				else {
					thisTNode.'@form' = 'prose'
				}
				// END DEBUG TMR 22/08/2012
				// Find PoS tag in BFM file
				// Try txm:ana tags
				def txmAnaList = wNode[nsTxm.ana]
				// Then interp tags
				if (! txmAnaList) {
					txmAnaList = wNode.interp
				}
				// First annotation of the given type, or null.
				// NB: this must be a comparison (==); an assignment here would
				// overwrite the type of the first annotation and always succeed.
				def anaOfType = { wanted -> txmAnaList.find { it.'@type' == wanted } }
				def posSyn = anaOfType('#pos_syn')
				def posPlain = anaOfType('#pos')
				def posFro = anaOfType('#fropos')
				if (posSyn) {
					// type="#pos_syn"
					thisTNode.'@pos' = posSyn.text()
				} else if (! txmAnaList && wNode.'@type') {
					// type attr of wNode
					thisTNode.'@pos' = wNode.'@type'
				} else if (posPlain) {
					// type="#pos"
					thisTNode.'@pos' = posPlain.text()
				} else if (posFro) {
					// type="#fropos"
					thisTNode.'@pos' = posFro.text()
				} else {
					thisTNode.'@pos' = '--'
				}
			}
			// End of injecting closure.
			// Check whether wNode matches tNode
			if (thisTNode.'@word' == '#') {
				// Not expecting to find these in BFM file, all OK.
				markUnknown(thisTNode)
			} else if ( idmatch(thisTNode, wNodes[wIx]) ) {
				// Perfect match
				wNode = wNodes[wIx]
				q = wNodesInQ.contains(wNode)
				injectPos.call()
				wIx++
			} else if (thisTNode.'@word' == '*') {
				// Duplicata, use dupl match list to find correct wNode
				def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
				if (duplIdMatch) {
					def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
					// To enhance performance, I assume initially that the wNode matching the
					// duplicata is within +- 10 words of the current position in the
					// list.  The wNodes list is very long, and searches over the entire
					// list are avoided.
					assert matchTNode, "$duplIdMatch"
					if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
						wNode = wNodes[wIx-10 .. wIx+10].find {
							idmatch(matchTNode, it)
						}
					}
					// Use whole list if it failed.
					if (! wNode) {
						wNode = wNodes.find {
							idmatch(matchTNode, it)
						}
					}
					q = wNodesInQ.contains(wNode)
					injectPos.call()
					// Don't update wIx value!
				} else {
					markUnknown(thisTNode)
					logger.log(
							Level.WARNING,
							"<w/> node matching TS duplicata $thisTNodeId not found."
							)
				}
			}  else if (punctTest(wNodes[wIx])) {
				// not expecting to find these in TS file; try same tNode again.
				tNodes.add(0, thisTNode)
				wIx++
			} else {
				// Search the whole word list once and reuse the result for
				// both of the remaining cases.
				def found = wNodes.find {
					idmatch (thisTNode, it)
				}
				if (! found) {
					// Check the SRCMF corpus hasn't acquired an extra word
					logger.log(
							Level.WARNING,
							"TS word $thisTNodeId (${thisTNode.'@word'}) is " +
							"absent from BFM file."
							)
					markUnknown(thisTNode)
				} else {
					// It doesn't match, it's not a duplicata,
					// it's not BFM punctuation, but it IS found somewhere in the BFM
					// corpus.
					logger.log(
							Level.WARNING,
							"Discontinuity in TS corpus before word $thisTNodeId " +
							"(${thisTNode.'@word'})."
							)
					wNode = found
					q = wNodesInQ.contains(wNode)
					injectPos.call()
					// Resume alignment right after the word that was found.
					wIx = wNodes.indexOf(wNode) + 1
				}
			}
		}
		/*
		 * End Main iteration
		 */
		/*******************
		 * Recalculate headpos
		 */
		logger.log(Level.INFO, "Recalculating headpos.")
		TS.s.graph.nonterminals.nt.each { nt ->
			// Edges labelled 'L' point at the head terminal(s).
			def a = nt.edge.findAll { it.'@label' == 'L' }
			if (a.size() == 0) {
				nt.'@headpos' = '--'
			} else if (a.size() == 1) {
				nt.'@headpos' = nt.parent().parent().terminals.t.find {
					it.'@id' == a[0].'@idref'
				}.'@pos'
			} else {
				// Several heads: collect their PoS values (collect, not each --
				// each would hand back the edges themselves) and keep the
				// first tag of the priority list that occurs among them.
				def headPosValues = a.collect { edge ->
					nt.parent().parent().terminals.t.find {
						it.'@id' == edge.'@idref'
					}.'@pos'
				}
				for (def headpos : ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET']) {
					if (headPosValues.contains(headpos)) {
						nt.'@headpos' = headpos + '?'
						break
					}
				}
			}
			assert nt.'@headpos'
		}
		logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

		// XmlUtil.serialize() emits a declaration announcing UTF-8, so the
		// bytes must be written as UTF-8 rather than in the platform charset.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(TS)
		}
	}
384
	// Subroutine self-test to establish whether a Groovy node is punctuation
385
	/**
	 * Tells whether a word node is punctuation.
	 *
	 * A non-empty 'type' attribute decides on its own; otherwise the txm:ana
	 * children are inspected, or the interp children when there are no
	 * txm:ana. The test is a case-insensitive "starts with 'pon'".
	 *
	 * @param gNode word node to test
	 * @return true when the node is recognised as punctuation
	 */
	private punctTest(gNode) {
		def typeAttr = gNode.'@type'
		if (typeAttr) {
			return typeAttr.toLowerCase().startsWith('pon')
		}
		def anaNodes = gNode[nsTxm.ana]
		if (anaNodes) {
			return anaNodes.find {it.text().toLowerCase().startsWith('pon')} != null
		}
		def interpNodes = gNode.interp
		if (interpNodes) {
			return interpNodes.find {it.text().toLowerCase().startsWith('pon')} != null
		}
		return false
	}
396

  
397
	/**
	 * Matches a TS node with a <w/> node by comparing the trailing integer of
	 * their ids.
	 *
	 * The TS id is read from 'editionId'; the word id from 'id' or, failing
	 * that, xml:id. A difference between the TS word form and the txm:form
	 * text is only logged: nodes with equal ids still match.
	 *
	 * @param tNode TS terminal node
	 * @param wNode XML-TXM word node
	 * @return true when the integer parts of both ids are equal
	 */
	private idmatch (tNode, wNode) {
		def wNodeId = wNode.'@id'
		if (! wNodeId) {
			wNodeId = wNode.attribute(nsXml.id)
		}
		def tNodeId = tNode.'@editionId'
		// Last run of digits, optionally followed by a lower-case/underscore suffix.
		def regex = /(\d+)([_a-z]+)?$/
		def m1 = (tNodeId =~ regex)
		def m2 = (wNodeId =~ regex)
		if (! (m1.find() && m2.find())) {
			// A separating space was missing between the two halves of
			// this message ("of thepaired").
			logger.log(
					Level.WARNING,
					"Idmatch routine can't find integer part of one or both of the " +
					"paired Ids for nodes $tNode, $wNode"
					)
			return false
		}
		if (m1.group(1).toInteger() != m2.group(1).toInteger()) {
			return false
		}

		// Ids agree; compare the forms for diagnostic purposes only.
		// (Use wNode.text() instead if treating old-style XML-BFM.)
		def wNodeForm = getDeepText(wNode[nsTxm.form][0])
		def tNodeForm = tNode.'@word'
		if (tNodeForm != wNodeForm) {
			logger.log(Level.WARNING,
					"Matching IDs ($tNodeId) but unmatched forms: TS form '$tNodeForm', BFM form '$wNodeForm'.")
		}
		return true
	}
436
	
437
	/**
	 * Concatenates, in document order, every String found among the children
	 * of a node and of its descendant nodes.
	 *
	 * @param node node whose text is collected
	 * @return the concatenated text, '' when there is none
	 */
	def getDeepText(def node) {
		def buffer = new StringBuilder()
		for (def child : node.children()) {
			if (child instanceof Node) {
				// Descend into nested elements.
				buffer.append(getDeepText(child))
			} else if (child instanceof String) {
				buffer.append(child)
			}
		}
		return buffer.toString()
	}
448
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/PunctInjectImport.groovy (revision 1000)
1
// Script to restore punctuation etc. to results of TS query.
2
// Inputs:
3
// --- Tiger-XML Document node
4
// --- Java array:
5
// --- --- String [index][type] where:
6
// --- --- --- type == 0 gives the xml:id
7
// --- --- --- type == 1 gives the word form
8
// Process:
9
// --- Injects punctuation.
10
// Returns:
11
// --- Tiger-XML Document node.
12

  
13
// TXM package statement
14
package org.txm.scripts.importer.srcmf
15

  
16
import javax.xml.parsers.DocumentBuilderFactory
17

  
18

  
19

  
20
/**
21
 * 
22
 * @author tmr
23
 *
24
 */
25
class PunctInjectImport
26
{
27

  
28
	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
29
	static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
30
	def tigerXml = null
31
	def txmIdWordTableFixed = []
32
	def outputFile = null
33

  
34
	/*
35
	 * Call this constructor when no need of XML-TXM file for word ID table
36
	 */
37
	/**
	 * @param tigerXmlFile   TIGER-XML file, parsed into a DOM tree
	 * @param txmIdWordTable list of [id, word form] pairs, used as given
	 */
	PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
		def parsedDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tigerXmlFile)
		this.tigerXml = parsedDoc.documentElement
		this.txmIdWordTableFixed = txmIdWordTable
		// Default target; callers may replace it before calling process().
		this.outputFile = new File("output.xml")
	}
43

  
44
	/*
45
	 * Call this constructor when the XML-TXM file is needed for the word list.    
46
	 */
47
	/**
	 * @param tigerXmlFile TIGER-XML file, parsed into a DOM tree
	 * @param xmlTxmFile   XML-TXM file; every TEI w element below its text
	 *                     element contributes one [id, txm:form text] pair
	 */
	PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
		def parsedDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tigerXmlFile)
		tigerXml = parsedDoc.documentElement
		txmIdWordTableFixed = []
		def teiRoot = new XmlParser().parse(xmlTxmFile)
		teiRoot.text.'**'.findAll { it.name() == nsTei.'w' }.each { w ->
			// Prefer a plain 'id' attribute, fall back to xml:id.
			def wordId = w.'@id' ?: w.attribute(nsXml.id)
			if (! wordId) {
				println "Error: ID attribute not located in XML TXM file"
			}
			txmIdWordTableFixed << [wordId, getDeepText(w."txm:form"[0])]
		}
		// Default target; callers may replace it before calling process().
		outputFile = new File("output.xml")
	}
67

  
68
	/**
	 * Injects the words of the XML-TXM word table that are missing from the
	 * TIGER-XML document (typically punctuation) as new t elements, then moves
	 * sentence-initial closing punctuation to the end of the previous
	 * terminals element, and finally writes the document to outputFile.
	 *
	 * New t elements carry the id and word of the table entry and '--' for
	 * every other attribute found on the first existing t element.
	 */
	def process () {
		def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
		def allTs = toList(this.tigerXml.getElementsByTagName('t'))
		def tAttrs = getTAttrNames(allTs[0])
		def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
		def addedWordIds = []
		// A word containing one of these characters is treated as punctuation.
		def punctPattern = /[\,\.\?\!\:\;\(\)\[\]\{\}]/
		int tIx = 0
		while (txmIdWordTable) {
			def word = txmIdWordTable.remove(0)
			if (tIx == allTs.size()) {
				// End of TS file, but still words left in the BFM file.
				addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
			}
			else {
				def tNode = allTs[tIx]
				def tId = getTNodeId(tNode)
				if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
					// Same id and same form: nothing to do.
					tIx += 1
				}
				else if (tId == word[0]) {
					println("Mismatched Ids! ($tId)")
					tIx += 1
				}
				else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
					// SRCMF duplicata; try comparing word against the next tNode next time
					// around.
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that the SRCMF corpus doesn't have a bug in it...
				// take(tIx) is the (possibly empty) list of earlier nodes; the
				// range 0..tIx-1 would become 0..-1, i.e. the WHOLE list, for
				// tIx == 0 and flag the first node as a duplicate of itself.
				else if ( !(word[1] =~ punctPattern)
				&& (allTs.take(tIx).find{
					it.getAttribute('id') == "$tNodeIdPrefix#$tId"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
				else if ( !(word[1] =~ punctPattern)
				&& (allTs[tIx..-1].find{
					it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else if (addedWordIds.contains(tId)) {
					println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else {
					// Insert word.  In the first instance, it will have the same parent as
					// the tNode before which it's being inserted.
					addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
					addedWordIds.add(word[0])
				}
			}
		}
		// Second phase: move punctuation into previous sentence,
		// dependent on sequence.
		def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
		for (def i = 1 ; i < allTerminalses.size() ; i++) {
			def ts = toList(allTerminalses[i].getElementsByTagName('t'))
			def startPunc = true
			def puncStack = []
			while (ts && startPunc) {
				// A word without any letter or digit is punctuation. The
				// range is spelled A-Za-z: "A-z" also covers [ \ ] ^ _ and
				// the backtick, so brackets were never collected although the
				// closing-punctuation pattern below expects them.
				if ((ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
					puncStack.add(ts.remove(0))
				}
				else {
					startPunc = false
				}
			}
			// Now, treat the punctuation stack at the beginning of the sentence
			if ( puncStack ) {
				int moveLeft = 0
				// One character per stacked node, so that string indexes are
				// node indexes. An empty word is represented by a blank, which
				// matches none of the patterns below (indexing '' would throw).
				def puncString = puncStack.collect{
					def w = it.getAttribute('word')
					w ? w[0] : ' '
				}.join('')
				// First, identify LAST instance of sentence-final punctuation.
				def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
				if (matches.size() > 0) {
					moveLeft = puncString.lastIndexOf(matches[-1]) + 1
				}
				// Second, split pairs of straight quotes
				matches = puncString =~ /(""|'')/ //"
				if (matches.size() > 0) {
					moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
				}
				// Now, move moveLeft punctuation nodes to the end of the prev. sentence
				ts = toList(allTerminalses[i].getElementsByTagName('t'))
				for (def j = 0 ; j < moveLeft ; j++ ) {
					allTerminalses[i - 1].appendChild(ts[j])
				}
			}
		}
		outputFile.withWriter("UTF-8") { writer ->
			writer.println(this.tigerXml)
		}
	}
169

  
170
	/**
	 * Creates a new t element for a word-table entry and attaches it relative
	 * to an existing t element.
	 *
	 * @param word          [id, word form] pair
	 * @param tNode         existing t element used as anchor
	 * @param tAttrs        attribute names to set on the new element; 'id' and
	 *                      'word' come from the entry, all others are '--'
	 * @param tNodeIdPrefix prefix placed before '#' in the new id
	 * @param where         'before' to insert in front of tNode, 'append' to
	 *                      add at the end of tNode's parent
	 * @throws IllegalArgumentException for any other value of where
	 */
	private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
		def created = tNode.getOwnerDocument().createElement('t')
		tAttrs.each { attrName ->
			if (attrName == 'id') {
				created.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
			} else if (attrName == 'word') {
				created.setAttribute('word', word[1])
			} else {
				created.setAttribute(attrName, '--')
			}
		}
		if (where == 'before') {
			return tNode.getParentNode().insertBefore(created, tNode)
		}
		if (where == 'append') {
			return tNode.getParentNode().appendChild(created)
		}
		throw new IllegalArgumentException('Bad before value')
	}
193
	/**
	 * Lists the attribute names of a DOM element.
	 *
	 * @param tNode DOM element
	 * @return list of attribute names, in the order of the element's attribute map
	 */
	def getTAttrNames(tNode) {
		def attrs = tNode.attributes
		return (0..<attrs.getLength()).collect { idx -> attrs.item(idx).nodeName }
	}
201
	/**
	 * Returns the part of a t element's 'id' attribute that follows the last '#'.
	 *
	 * @param tNode DOM element with an 'id' attribute
	 * @return the last '#'-separated token of the id
	 */
	def getTNodeId(tNode) {
		def idParts = tNode.getAttribute('id').tokenize('#')
		return idParts.last()
	}
204
	/**
	 * Copies the elements of an iterable object (here: DOM node lists) into a
	 * new collection by keeping every element.
	 *
	 * @param iterable object to copy from
	 * @return a new collection holding all elements
	 */
	def toList(def iterable) {
		// The always-true filter keeps every element.
		return iterable.findAll { element -> true }
	}
207
	/**
	 * Concatenates, in document order, every String found among the children
	 * of a node and of its descendant nodes.
	 *
	 * @param node node whose text is collected
	 * @return the concatenated text, '' when there is none
	 */
	def getDeepText(def node) {
		def parts = new StringBuilder()
		for (def child : node.children()) {
			if (child instanceof Node) {
				// Descend into nested elements.
				parts.append(getDeepText(child))
			} else if (child instanceof String) {
				parts.append(child)
			}
		}
		return parts.toString()
	}
218

  
219
	/**
	 * Command-line entry point.
	 *
	 * Expects two positional arguments, the TIGER-XML file and the XML-TXM
	 * file, and an optional -o output file.
	 *
	 * @param args raw command-line arguments
	 */
	public static void main(String[] args)
	{
		def cli = new CliBuilder(
				usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
				)
		cli.h(longOpt:'help', 'Prints this message.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
		def options = cli.parse(args)
		// cli.parse() returns null (after reporting the problem itself) when
		// the options cannot be parsed.
		if (!options) {
			System.exit(2)
		}
		// Honour the declared -h/--help option.
		if (options.h) {
			cli.usage()
			System.exit(0)
		}
		if (options.arguments().size() != 2) {
			println 'Incorrect number of command line arguments... exiting'
			// usage() prints by itself; wrapping it in println would print "null".
			cli.usage()
			System.exit(2)
		}
		def tigerXmlFile = new File(options.arguments()[0])
		def xmlTxmFile = new File(options.arguments()[1])
		def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
		if (options.o) {
			pii.outputFile = new File(options.o)
		}
		pii.process()
	}
243
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/srcmfLoader.groovy (revision 1000)
1
package org.txm.scripts.importer.srcmf
2

  
3
// srcdir
4
// txml/master.xml
5
// txml/*.xml
6
// txm/*.xml
7

  
8
// 1 import xml-txm
9
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11
import ims.tiger.index.writer.SimpleErrorHandler;
12
import ims.tiger.index.writer.XMLIndexing;
13
import ims.tiger.system.*;
14

  
15
import javax.xml.stream.*;
16

  
17
import java.io.BufferedWriter;
18
import java.io.File;
19
import java.io.FileOutputStream;
20
import java.io.IOException;
21
import java.io.OutputStreamWriter;
22
import java.io.PrintStream;
23
import java.net.URL;
24
import java.util.HashMap;
25
import java.util.logging.FileHandler
26
import java.io.File;
27

  
28
import org.apache.log4j.BasicConfigurator;
29
import org.txm.export.ts.*;
30
import org.txm.importer.ApplyXsl2;
31
import org.txm.utils.xml.DomUtils;
32
import org.txm.importer.ValidateXml;
33
import org.txm.importer.XPathResult;
34
import org.txm.importer.xmltxm.compiler;
35
import org.txm.importer.xml.pager;
36
import org.txm.objects.*;
37
import org.txm.utils.*
38
import org.txm.utils.io.*;
39
import org.txm.*;
40
import org.txm.importer.xmltxm.*;
41
import org.txm.metadatas.*;
42
import org.txm.utils.i18n.*;
43
import org.w3c.dom.Element
44
import org.xml.sax.SAXException;
45

  
46
import javax.xml.parsers.DocumentBuilderFactory
47

  
48
// ---------------------------------------------------------------------------
// SRCMF loader script.
//
// Steps, in order:
//   1. obtain the import parameters (from the script binding, or from a
//      hard-coded development file when the binding is absent);
//   2. copy the validated XML-TXM sources into the binary corpus directory;
//   3. run the XML-TXM compiler, then build the paginated "default" HTML edition;
//   4. save the import parameters next to the binary corpus;
//   5. build the TIGER-XML file with SrcmfImporter, inject punctuation with
//      PunctInjectImport, write the TIGERSearch log configuration and index
//      the result with XMLIndexing.
//
// The script stops early (plain `return`) whenever a step cannot proceed or
// the monitor reports cancellation.
// ---------------------------------------------------------------------------
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
try {params = paramsBinding;MONITOR=monitor} catch (Exception)
{	println "DEV MODE";//exception means we debug
	debug = true
	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
	}
}
if (params == null) { println "no parameters. Aborting"; return; }

// Values read from the import parameters.
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Expected source layout under rootDir: xml-txm/*.xml, tiger-xml/master.xml, tiger-xml/header.xml
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// Start from an empty binary corpus directory.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
	if (f.getName().equals("import.xml") || f.getName().matches("metadata\\.....?") || f.getName().endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, f.getName()));
	} else {
		println "Won't process file "+f;
	}
}
// listFiles() yields an empty array (not null) for an existing empty directory,
// so both cases must be tested for the "nothing to process" guard to be effective.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
	println "No txm file to process"
	return;
}

List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"
def c = new compiler();
if (debug) c.setDebug();
c.setLang(lang);
//c.setSortMetadata(sortMetadata)
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
	println "import process stopped";
	return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();


// Number of files already announced on the console; drives the ", " separator
// and a line break after every fifth name.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
	String txtname = srcfile.getName();
	int i = txtname.lastIndexOf(".");
	if(i > 0) txtname = txtname.substring(0, i);
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
	if (second > 0) { print(", ") }
	if (second > 0 && (second % 5) == 0) println ""
	// The counter used to be incremented only inside the condition above, which
	// could never become true while it was still 0; increment unconditionally.
	second++
	print(srcfile.getName());
	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
	// i is advanced inside the body so that page numbers start at 1.
	for (i = 0 ; i < ed.getPageFiles().size();) {
		File f = ed.getPageFiles().get(i);
		String wordid = ed.getIdx().get(i);
		params.addPage(edition, ""+(++i), wordid);
	}
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// Undeclared variable: stored in the script binding.
readyToLoad = true;


// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");
// NOTE(review): takes whichever entry listFiles() returns first; the copy loop above
// skips import.xml/metadata/.properties files but this line does not — confirm that
// xml-txm really contains a single text file.
File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file 

// Run SRCMF cmd line corpus import script 
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
PunctInjectImport injector;
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return;
} else {
	injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
}
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()

// Create the tigersearch.logprop file (used to launch TIGERSearch)
File logprop = new File(tigerDir, "tigersearch.logprop");
logprop.withWriter("UTF-8") { writer ->
	writer.write("""# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")	
}

// Run TigerRegistry
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
	IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
	XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
	indexing.startIndexing();
}
// Indexing failures are reported on the console only; the script carries on.
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }

// The intermediate TIGER-XML file is removed once indexing has been attempted.
tigerXmlAll.delete()
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TigerXMLHeader.xsd (revision 1000)
1
<?xml version="1.0" encoding="UTF-8"?>
  <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">

 <!-- =======================================================================
      XML SubSchema for the header part of the TIGER-XML format
      http://www.ims.uni-stuttgart.de/projekte/TIGER/publicTigerXMLHeader.xsd
      =======================================================================
      TIGER Project, Wolfgang Lezius 
      IMS, University of Stuttgart, 04/01/2003
      ======================================================================= -->

 <!-- Maintenance note: closing tags are written </xs:name> and built-in types
      use the declared "xs" prefix, so that the document is well-formed and
      every QName resolves. -->


  <!-- ======================================================
       DECLARATION OF THE HEADER
       ====================================================== -->


  <!-- declaration of the head element -->

  <xs:element name="head" type="headType"/>


  <!-- declaration of the header type -->

  <xs:complexType name="headType">

     <xs:sequence>
        <xs:element name="meta" type="metaType" minOccurs="0" maxOccurs="1"/>
        <xs:element name="annotation" type="annotationType" minOccurs="0" maxOccurs="1"/>
     </xs:sequence>

     <!-- optional: reference to external header file 

          The header of a TigerXML corpus can also be stored in separate file. 
          This attribute points to the external header file. The pointer is
          an URI. Examples: file:relative.xml or file:/path/to/absolute.xml

          Note: If there is a pointer to an external file, the head
                element must be empty. -->

     <xs:attribute name="external" type="xs:anyURI"/>

  </xs:complexType>


  <!-- declaration of the meta information type -->

  <xs:complexType name="metaType">

    <xs:sequence>
      <xs:element name="name" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="author" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="date" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="description" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="format" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="history" type="xs:string" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the annotation type -->

  <xs:complexType name="annotationType">

    <xs:sequence>
      <xs:element name="feature" type="featureType" minOccurs="1" maxOccurs="unbounded"/>
      <xs:element name="edgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/>
      <xs:element name="secedgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the feature type -->

  <xs:complexType name="featureType">

    <xs:sequence>
       <xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>

    <xs:attribute name="name" type="featurenameType" use="required"/>

    <xs:attribute name="domain" use="required">
       <xs:simpleType>
         <xs:restriction base="xs:string">
           <xs:enumeration value="T"/>     <!-- feature for terminal nodes -->
           <xs:enumeration value="NT"/>    <!-- feature for nonterminal nodes -->
           <xs:enumeration value="FREC"/>  <!-- feature for both -->
         </xs:restriction>
       </xs:simpleType>
    </xs:attribute>

  </xs:complexType>


  <!-- declaration of the (secondary) edge label type -->

  <xs:complexType name="edgelabelType">

    <xs:sequence>
       <xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the feature value type -->

  <xs:complexType name="featurevalueType">

    <xs:simpleContent>   <!-- element content: documentation of the feature value -->
      <xs:extension base="xs:string">
        <xs:attribute name="name" type="xs:string"/>
      </xs:extension>
    </xs:simpleContent>


  </xs:complexType>


  <!-- ======================================================
       HEADER DECLARATIONS THAT SHOULD BE REFINED
       ====================================================== -->

  <!-- declaration of the FEATURE NAMES used in the corpus header;
       this type is unrestricted, but should be refined by a 
       specialised, corpus-dependent schema -->

  <xs:simpleType name="featurenameType">

    <xs:restriction base="xs:string">
      <xs:minLength value="1"/>
      <xs:maxLength value="20"/>
      <xs:whiteSpace value="preserve"/>
    </xs:restriction>

  </xs:simpleType>


</xs:schema>
0 143

  
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImport.groovy (revision 1000)
1
package org.txm.scripts.importer.tigersearch;
2

  
3
import java.io.File;
4
import java.util.ArrayList;
5

  
6
import ims.tiger.index.writer.*
7
import ims.tiger.system.*
8

  
9
import org.txm.Toolbox;
10
import org.txm.importer.ApplyXsl2;
11
import org.txm.importer.xtz.*
12
import org.txm.objects.BaseParameters
13
import org.txm.utils.BundleUtils;
14
import org.txm.utils.io.FileCopy;
15
import org.apache.log4j.BasicConfigurator;
16
import org.txm.importer.xtz.*
17
import org.txm.scripts.importer.xtz.*
18

  
19
class TSImport extends XTZImport {
20

  
21
	public TSImport(BaseParameters params) {
22
		super(params);
23
	}
24

  
25
	@Override
26
	public void init(BaseParameters p) {
27
		super.init(p);
28

  
29
		importer = new TSImporter(this); // only to build metadata
30
		compiler = new XTZCompiler(this)
31
		annotater = null; // no annotater step to do
32
		pager = new XTZPager(this)
33
	}
34

  
35
	/**
36
	 * Do a XTZ Import then build the TIGERSearch indexes in the binary corpus "tiger" directory
37
	 */
38
	@Override
39
	public void start() throws InterruptedException {
40
		File tigerSrcDir = sourceDirectory
41

  
42
		def xmlFiles = []
43
		sourceDirectory.listFiles(new FileFilter() {
44
					boolean accept(File file) {
45
						if (file.isDirectory()) return false;
46
						if (file.isHidden()) return false;
47
						if (file.getName().equals("import.xml")) return false;
48
						if (!file.getName().endsWith(".xml")) return false;
49

  
50
						xmlFiles << file
51
					}
52
				});
53

  
54
		xmlFiles.remove(new File(sourceDirectory, "import.xml"))
55

  
56
		if (xmlFiles.size() == 0) {
57
			println "Error no XML file found in $sourceDirectory"
58
			isSuccessful = false;
59
			return;
60
		}
61

  
62
		File master = xmlFiles[0];
63
		println "Main TIGER XML file found: $master"
64

  
65
		File tsXSLFile = new File(Toolbox.getTXMHOMEPATH(), "xsl/ts.xsl");
66
		BundleUtils.copyFiles("TIGERSearchRCP", "src", "org/txm/importer/tigersearch", "ts.xsl", tsXSLFile.getParentFile());
67

  
68
		File xmltxmSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation
69
		xmltxmSrcDir.mkdirs();
70
		FileCopy.copy(master, new File(xmltxmSrcDir, master.getName()));
71

  
72
		if (!ApplyXsl2.processImportSources(tsXSLFile, xmltxmSrcDir, xmltxmSrcDir)) {
73
			println "Error while applying TS XSL file to $tigerSrcDir"
74
			isSuccessful = false;
75
			return;
76
		}
77

  
78
		File[] files = xmltxmSrcDir.listFiles();
79
		if (files == null || files.length == 0) {
80
			println "Error while applying TS XSL file to $xmltxmSrcDir is empty"
81
			isSuccessful = false;
82
			return;
83
		}
84

  
85
		sourceDirectory = xmltxmSrcDir; // hop
86
		File txmDir = new File(binaryDirectory, "txm/"+corpusName);
87
		txmDir.mkdirs();
88
		FileCopy.copyFiles(sourceDirectory, txmDir) // the compiler step will use these files
89

  
90
		super.start(); // call the usual XTZ import
91

  
92
		if (isSuccessful) {
93

  
94
			File tigerDir = new File(binaryDirectory, "tiger");
95
			tigerDir.mkdir();
96

  
97

  
98
			File logprop = new File(tigerDir, "tigersearch.logprop");
99

  
100
			logprop.withWriter("UTF-8") { writer ->
101
				writer.write("""# Default log configuration of the TIGERSearch suite
102
log4j.rootLogger=WARN,Logfile
103
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=WARNING
104
log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff