Révision 1000
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/TigerSubcorpus2Main.groovy (revision 1000) | ||
---|---|---|
1 |
/* |
|
2 |
Combines subcorpus files to give a single main file. |
|
3 |
*/ |
|
4 |
// TXM package statement |
|
5 |
package org.txm.scripts.importer.srcmf |
|
6 |
|
|
7 |
// Command-line entry point |
|
8 |
|
|
9 |
// Command-line entry point: expects exactly one positional argument (the
// TIGER master file) and an optional -o/--output file.
def cli = new CliBuilder(
	usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
// 'options' is deliberately left undeclared so that it stays in the script binding.
options = cli.parse(args)
if (!options) {
	// parse() returns null on a malformed command line, after printing the usage.
	System.exit(2)
}
if (options.h) {
	cli.usage()
	System.exit(0)
}
if (options.arguments().size() != 1) {
	println 'Incorrect number of command line arguments... exiting'
	// usage() prints the help text itself and returns nothing, so it must not
	// be wrapped in println (that printed a stray "null").
	cli.usage()
	System.exit(2)
}
def tigerMaster = new File(options.arguments()[0])
// Falls back to 'outputfile.xml' in the working directory when -o is absent.
def outputFile = new File(options.o ?: 'outputfile.xml')
script(tigerMaster, outputFile)
|
26 |
|
|
27 |
/**
 * Merges every subcorpus file referenced by a TIGER master file into a single
 * document and writes the result to outputFile in UTF-8.
 *
 * Each <subcorpus external="file:NAME"/> element found under a <body> of the
 * master is removed and replaced by the children of the root element of NAME,
 * which is resolved against the directory of the master file.
 *
 * @param tigerMaster the TIGER master file
 * @param outputFile  the file receiving the merged document
 */
def script (File tigerMaster, File outputFile) {
	def mainDoc = new XmlParser().parse(tigerMaster)
	File baseDir = tigerMaster.getParentFile()
	mainDoc.body.each { bodyNode ->
		// bodyNode.subcorpus is a snapshot list, so bodyNode can be edited while looping
		bodyNode.subcorpus.each { reference ->
			// drop the 5-character prefix ("file:") of the external reference
			String relativeName = "${reference.'@external'}".toString().substring(5)
			def partDoc = new XmlParser().parse(new File(baseDir, relativeName))
			bodyNode.remove(reference)
			partDoc.children().each { bodyNode.append(it) }
		}
	}
	def buffer = new StringWriter()
	def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
	printer.print(mainDoc)
	def serialized = buffer.toString()
	outputFile.withWriter("UTF-8") { writer ->
		writer.println('<?xml version="1.0" encoding="UTF-8"?>')
		writer.print(serialized)
	}
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/package.html (revision 1000) | ||
---|---|---|
1 |
<html> |
|
2 |
<body> |
|
3 |
<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources</p> |
|
4 |
</body> |
|
5 |
</html> |
|
0 | 6 |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/MasterReheader.groovy (revision 1000) | ||
---|---|---|
1 |
#! /usr/bin/groovy |
|
2 |
package org.txm.scripts.importer.srcmf; |
|
3 |
|
|
4 |
/* |
|
5 |
* To change this template, choose Tools | Templates |
|
6 |
* and open the template in the editor. |
|
7 |
*/ |
|
8 |
|
|
9 |
// Set up globals |
|
10 |
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml') |
|
11 |
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml') |
|
12 |
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml') |
|
13 |
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']] |
|
14 |
// def firstFeat = [t:'word', nt:'cat'] |
|
15 |
|
|
16 |
// Command-line entry point |
|
17 |
// Command-line entry point: expects the master file and the header file as
// positional arguments; everything else is optional.
def cli = new CliBuilder(
	usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
// 'options' is deliberately left undeclared so that it stays in the script binding.
options = cli.parse(args)
if (!options) {
	// parse() returns null on a malformed command line, after printing the usage.
} else if (options.h) {
	cli.usage()
} else if (options.arguments().size() == 2) {
	def masterFile = new File(options.arguments()[0])
	def headerFile = new File(options.arguments()[1])
	def masterFolder = masterFile.getCanonicalFile().getParent()
	def outputFile = options.o ?
		new File(options.o) :
		new File(masterFolder, 'MasterReheader_out.xml')
	// Defaults for the feature listed first in each domain.
	def ntfirst = options.nt1 ?: 'cat'
	def tfirst = options.t1 ?: 'word'
	// 'options.nts' / 'options.ts' give every value of -nt / -t as a list, or
	// false when the option is absent. Normalise to a (possibly empty) list
	// and also accept comma-separated names inside a single value.
	def toFeatureList = { given -> (given ?: []).collectMany { it.tokenize(',') } }
	script(
		masterFile, headerFile, outputFile,
		['nt':toFeatureList(options.nts), 't':toFeatureList(options.ts)],
		// pass the resolved names, so that the defaults above take effect
		['nt':ntfirst, 't':tfirst]
	)
} else {
	println 'Incorrect number of command line arguments... exiting'
	// usage() prints the help text itself; wrapping it in println printed "null".
	cli.usage()
}
|
54 |
|
|
55 |
/**
 * Rebuilds the <feature> declarations in the head of a TIGER master file from
 * the attributes actually used in its subcorpus files.
 *
 * Every attribute found on <t> and <nt> nodes (except "id") becomes a
 * <feature> node under the first head/annotation element of the master. For
 * the features listed in 'feats', one <value> child is added per attested
 * value; its description is looked up in the header file and defaults to
 * '[unknown]'.
 *
 * @param masterFile the TIGER master file; subcorpus files are resolved
 *                   against its directory
 * @param headerFile a file containing <feature>/<value> descriptions
 * @param outputFile the file receiving the updated master (UTF-8)
 * @param feats      map 't'/'nt' -> names of the features that get <value> nodes
 * @param firstFeat  map 't'/'nt' -> name of the feature to list first; replaced
 *                   by the alphabetically first attribute when not attested
 */
def script(
	File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
	// Load master and header files
	def master = new XmlParser().parse(masterFile)
	def header = new XmlParser().parse(headerFile)
	def masterFolder = masterFile.getCanonicalFile().getParent()

	// Attested values per domain: attribute name -> set of values.
	// Plain map literals keep the insertion order of the attribute names.
	def attrVal = [nt:[:], t:[:]]
	def collectValues = { node, type ->
		node.attributes().each { entry ->
			if (! attrVal[type].containsKey(entry.getKey())) {
				attrVal[type][entry.getKey()] = new HashSet()
			}
			attrVal[type][entry.getKey()].add(entry.getValue())
		}
	}

	// Scan subcorpus files and build attribute lists.
	master.body.subcorpus.each {
		// the external reference carries a 5-character prefix ("file:")
		def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
		def subcorpus = new XmlParser().parse(subcorpusFile)
		subcorpus.s.graph.terminals.t.each { collectValues.call(it, 't') }
		subcorpus.s.graph.nonterminals.nt.each { collectValues.call(it, 'nt') }
	}
	// Id isn't an attribute in the header.
	attrVal['t'].remove('id')
	attrVal['nt'].remove('id')

	// Remove old feature nodes in master file (iterating over a snapshot list)
	master.head.annotation.feature.each { oldNode ->
		oldNode.parent().remove(oldNode)
	}
	assert (! master.head.annotation.feature)

	// Check firstFeat was relevant. Work on a copy so that the caller's map
	// is left untouched.
	def first = new HashMap(firstFeat)
	['t', 'nt'].each { type ->
		if (! attrVal[type].containsKey(first[type])) {
			first[type] = attrVal[type].keySet().sort()[0]
		}
	}
	assert attrVal['t'].keySet().contains(first['t'])
	assert attrVal['nt'].keySet().contains(first['nt'])
	def featList = [:]
	['t', 'nt'].each { type ->
		featList[type] = [first[type]]
		featList[type].addAll(attrVal[type].keySet().findAll { it != first[type] })
	}

	// The header is scanned once; the original re-scanned it for every value.
	def headerFeatures = header.'**'.feature

	// Add new feature and value nodes
	['t', 'nt'].each { type ->
		// tolerate a missing (null or false) entry for this domain
		def wanted = feats[type] ?: []
		featList[type].each { feat ->
			// Passing the parent to the Node constructor attaches the new node to it.
			def fNode = new Node(master.head.annotation[0], 'feature',
				['domain':type.toUpperCase(), 'name':feat]
			)
			// Add value nodes if the feature is listed in 'feats'
			if (wanted.contains(feat)) {
				attrVal[type][feat].each { value ->
					assert headerFeatures
					assert headerFeatures[0].'@name'
					assert headerFeatures[0].'@domain'
					assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
					def hFNode = headerFeatures.find {
						it.'@name' == feat && (
							it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
						)
					}
					def described = hFNode ? hFNode.value.find { it.'@name' == value } : null
					def vText = described ? described.text() : '[unknown]'
					new Node(fNode, 'value', ['name':value], vText)
				}
			}
		}
	}

	// Save to output file. XmlUtil.serialize declares encoding="UTF-8" in the
	// XML declaration, so the bytes must really be written as UTF-8.
	outputFile.withWriter('UTF-8') { writer ->
		writer << groovy.xml.XmlUtil.serialize(master)
	}
}
|
138 |
|
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SrcmfImporter.groovy (revision 1000) | ||
---|---|---|
1 |
/* |
|
2 |
* Calls all import scripts |
|
3 |
*/ |
|
4 |
|
|
5 |
/** |
|
6 |
* |
|
7 |
* @author tmr |
|
8 |
*/ |
|
9 |
|
|
10 |
// Command line form of import statements: |
|
11 |
// import MasterReheader |
|
12 |
// import PunctInjectImport |
|
13 |
// import SubcorpusDataInject |
|
14 |
// import TigerSubcorpus2Main |
|
15 |
// import java.util.logging.FileHandler |
|
16 |
// import javax.xml.parsers.DocumentBuilderFactory |
|
17 |
|
|
18 |
// TXM package statement |
|
19 |
package org.txm.scripts.importer.srcmf |
|
20 |
|
|
21 |
import java.util.logging.*; |
|
22 |
|
|
23 |
|
|
24 |
// Command line entry point |
|
25 |
// Command line entry point: expects the TIGER master file, the XML-TXM file
// and the header file, in that order.
def cli = new CliBuilder(
	usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
// 'options' is deliberately left undeclared so that it stays in the script binding.
options = cli.parse(args)
if (!options) {
	// parse() returns null on a malformed command line, after printing the usage.
	System.exit(2)
}
if (options.h) {
	cli.usage()
	System.exit(0)
}
if (options.arguments().size() != 3) {
	println 'Incorrect number of command line arguments... exiting'
	// usage() prints the help text itself; wrapping it in println printed "null".
	cli.usage()
	System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The working ("bin") directory is the grandparent of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
	tigerFile,
	txmFile,
	headerFile,
	txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The merged file is only an intermediate result of the punctuation step.
tigerXmlAll.delete()
|
47 |
|
|
48 |
/**
 * Runs every import step except punctuation injection: PoS injection into the
 * subcorpus files, regeneration of the master header, and merging of master
 * and subcorpus files into one document.
 *
 * @param tigerFile  the TIGER master file
 * @param txmFile    the XML-TXM file; only its directory is used here
 * @param headerFile the header file handed to MasterReheader
 * @param binDir     working directory; receives "tiger/" and "tiger.log"
 * @return the merged file "tiger/TigerAll.xml"
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
	// Run pos injection script
	File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
	File tigerDir = new File(binDir, "tiger")
	tigerDir.mkdirs()
	File masterpos = new File(tigerDir, "master_pos.xml")
	// NOTE(review): takes the first file of the directory rather than txmFile
	// itself; this presumably assumes the directory holds one file. Confirm.
	File xmltxm = txmSrcDir.listFiles()[0]
	File logFile = new File(binDir, "tiger.log")
	def logHandler = new FileHandler(logFile.getAbsolutePath())
	try {
		def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
		sdi.processMaster(tigerFile, masterpos)
	} finally {
		// Detach and close the handler: otherwise the log file (and its .lck
		// companion) stays open and the handler piles up on the shared logger.
		SubcorpusDataInject.logger.removeHandler(logHandler)
		logHandler.close()
	}
	// Run reheader script
	def reheader = new MasterReheader()
	File tmp = File.createTempFile("tmp", ".xml",tigerDir)
	def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
	def firstFeat = ['nt':'cat', 't':'word']
	reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
	// createTempFile always creates the file, so an empty file is the real
	// sign that nothing was written.
	if (!tmp.exists() || tmp.length() == 0) {
		println "Error: reheader failed"
	}
	masterpos.delete()
	if (!tmp.renameTo(masterpos)) {
		println "Error: could not rename ${tmp} to ${masterpos}"
	}
	// Run merge master & subcorpus script
	def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
	def mergescript = new TigerSubcorpus2Main()
	mergescript.script(masterpos, tigerXmlAll)
	return tigerXmlAll
}
|
78 |
|
|
79 |
/**
 * Runs punctuation injection on the merged TIGER file; the result is written
 * to "TigerPnc.xml" next to it.
 *
 * @param tigerXmlAll the merged TIGER-XML file
 * @param txmFile     the XML-TXM file providing the word list
 * @return whatever PunctInjectImport.process() returns
 */
def doPnc(File tigerXmlAll, File txmFile) {
	// Declared locally: an undeclared name would be stored in the script binding.
	def injector = new PunctInjectImport(tigerXmlAll, txmFile)
	injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
	injector.process()
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SubcorpusDataInject.groovy (revision 1000) | ||
---|---|---|
1 |
#! /usr/bin/groovy |
|
2 |
package org.txm.scripts.importer.srcmf; |
|
3 |
/* |
|
4 |
* To change this template, choose Tools | Templates |
|
5 |
* and open the template in the editor. |
|
6 |
*/ |
|
7 |
|
|
8 |
import java.util.logging.Logger |
|
9 |
import java.util.logging.FileHandler |
|
10 |
import java.util.logging.SimpleFormatter |
|
11 |
import java.util.logging.Level |
|
12 |
|
|
13 |
|
|
14 |
/**
 * Injects information taken from an XML-TXM (TEI) file into TIGER-XML
 * subcorpus files.
 *
 * Every terminal (<t>) node receives three attributes: "pos" (part of speech
 * of the matching <w> node), "form" (prose / verse position) and "q" (whether
 * the word stands inside a <q> element). Afterwards the "headpos" attribute of
 * every non-terminal (<nt>) node is recalculated from its "L" edges.
 */
class SubcorpusDataInject {
	// Globals
	/** Default text form; overridden by the constructor argument and the TEI header. */
	def textForm = 'prose'
	/** Parsed XML-TXM document. */
	def TEI = null
	/** All <w> nodes of the text, in document order. */
	def wNodes = []
	/** The <w> nodes found inside <q> elements. */
	def wNodesInQ = []
	// Set up namespaces
	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
	static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
	// Set up a logger
	static logger = Logger.getLogger('subcorpusDataInjectLog')
	// END GLOBALS

	/**
	 * Parses the XML-TXM file and prepares the word lists.
	 *
	 * @param xmlTxmFile  the XML-TXM file
	 * @param fh          handler added to the class logger
	 * @param newTextForm default text form; ignored when empty, and overridden
	 *                    by a text form declared in the TEI header
	 */
	SubcorpusDataInject (
		File xmlTxmFile,
		FileHandler fh,
		String newTextForm
	) {
		// Reset global values
		if (newTextForm) {
			textForm = newTextForm
		}
		// Set up logger
		def formatter = new SimpleFormatter()
		logger.addHandler(fh)
		logger.setLevel(Level.ALL)
		fh.setFormatter(formatter)
		// Parse the XML-TXM file
		logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
		TEI = new XmlParser().parse(xmlTxmFile)
		// Get text form from TEI header (if expressed)
		logger.log(Level.INFO, "Reading TEI header to detect text form.")
		if (TEI.teiHeader.profileDesc.textClass.catRef) {
			// the target is a "#id" reference: drop the leading '#'
			def catId = TEI.teiHeader.profileDesc.textClass.catRef.'@target'[0][1..-1]
			def category = TEI.teiHeader.encodingDesc.'**'.category.find {
				it.'@id' == catId || it.attribute(nsXml.id) == catId
			}
			if (category && category.catDesc) {
				textForm = category.catDesc[0].text()
				logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
			} else {
				// A dangling reference used to end in a NullPointerException.
				logger.log(Level.WARNING,
					"Category '$catId' not described in TEI header, assuming '$textForm'.")
			}
		} else {
			logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
		}
		// Convert Lbs to Property
		if (textForm == 'vers' || textForm == 'mixte') {
			logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
			// Work list for a document-order traversal
			def undone = TEI.text.body
			def lastLb = false
			def lastWord = null
			while (undone) {
				def aNode = undone.remove(0)
				if (aNode.name().getLocalPart() == 'lb') {
					lastLb = true
					// the last word seen before a line break ends its line
					if (lastWord) {
						lastWord.'@lb' = 'end'
					}
				} else if (aNode.name().getLocalPart() == 'w') {
					// punctuation is ignored when locating line boundaries
					if (!punctTest(aNode)) {
						if (lastLb) {
							aNode.'@lb' = 'start'
						}
						lastWord = aNode
						lastLb = false
					}
				} else {
					// descend: child elements are visited before the remaining siblings
					undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
				}
			}
		}
		// Build node lists
		logger.log(Level.INFO, "Building node lists.")
		wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
		wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
	}

	/**
	 * Command-line entry point:
	 * SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml
	 */
	static main(def args) {
		// Parse command line args
		def outputFile = 'test-out.xml'
		def logName = 'subcorpusDataInjectLog.log'
		def cli = new CliBuilder(
			usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
		)
		cli.h(longOpt:'help', 'Prints this message')
		cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
		cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
		cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
		println args
		def options = cli.parse(args)
		if (!options) {
			// parse() returns null on a malformed command line, after printing the usage.
			return
		}
		if (options.arguments().size() != 2) {
			println "Wrong number of arguments."
			// usage() prints the help text itself; wrapping it in println printed "null".
			cli.usage()
			return
		}
		// Call XML TXM loader
		def sdi = new SubcorpusDataInject(
			new File (options.arguments()[0]),
			new FileHandler(options.logfile ?: logName),
			options.v ? 'vers' : ''
		)
		// Call process master or process subcorpus
		def inputFile = new File(options.arguments()[1]).getCanonicalFile()
		def target = new File (options.output ?: outputFile)
		if (options.m) {
			sdi.processMaster(inputFile, target)
		} else {
			sdi.processSubcorpus(inputFile, target)
		}
	}

	/**
	 * Runs processSubcorpus on each subcorpus listed in a master file and saves
	 * an updated master whose references point at the generated "*_pos.xml"
	 * files.
	 *
	 * @param masterFile the TIGER master file; subcorpora are read from its directory
	 * @param outputFile the new master file; subcorpora are written to its directory
	 */
	def processMaster (File masterFile, File outputFile) {
		logger.log(Level.INFO, "Loading corpus master file")
		// Resolve the folders through the absolute path: new File('', name)
		// would point at the filesystem root, not at the working directory.
		File masterFolder = masterFile.getAbsoluteFile().getParentFile()
		File outputFolder = outputFile.getAbsoluteFile().getParentFile()
		def master = new XmlParser().parse(masterFile)
		// Process each subcorpus file
		master.body.subcorpus.each {
			String external = it.'@external'
			// [5..-1] drops the 5-character prefix ("file:"); -5 also drops the
			// 4-character extension
			processSubcorpus(
				new File (masterFolder, external[5..-1]),
				new File (outputFolder, "${external[5..-5]}_pos.xml".toString())
			)
			it.'@external' = "${external[0..-5]}_pos.xml".toString()
		}
		logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

		// XmlUtil.serialize declares encoding="UTF-8": write the bytes as UTF-8.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(master)
		}
		logger.log(Level.INFO, "Run reheader on this file!")
	}

	/**
	 * Adds "pos", "form" and "q" to every terminal of a subcorpus file,
	 * recalculates "headpos" on its non-terminals and saves the result.
	 *
	 * @param subcorpusFile the TIGER-XML subcorpus file
	 * @param outputFile    the file receiving the updated subcorpus (UTF-8)
	 */
	def processSubcorpus (File subcorpusFile, File outputFile) {
		// Parse Subcorpus file
		logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
		def TS = new XmlParser().parse(subcorpusFile)
		// This is used as a stack
		def tNodes = TS.s.graph.terminals.t
		// This isn't
		def tNodesFixed = TS.s.graph.terminals.t

		// Build duplicata correspondence list: [duplicata id, id of the real word]
		logger.log(Level.INFO, "Building duplicata correspondance list")
		def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
			def duplId = dupl.'@id'
			def realId = dupl.parent().parent().nonterminals.nt.edge.find {
				it.'@idref' == duplId
			}.parent().secedge.find {
				it.'@label' == 'dupl'
			}.'@idref'
			[duplId, realId]
		}
		logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

		// Add pos & form tags
		logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
		// Get the index of the first wNode corresponding to a tNode. The index
		// is tested against -1: 0 is a valid position and -1 means "not
		// found", which a Groovy truth test gets exactly backwards.
		def i = 0
		def wIx = -1
		while (i < tNodes.size()) {
			// Avoid trying to find "fake" first words, it's very inefficient
			if (tNodes[i].'@word' != '#') {
				wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
			}
			if (wIx >= 0) {
				break
			}
			i++
		}
		assert wIx >= 0, "$i"
		/*
		 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
		 */
		// Iterate over tNodes.
		while (tNodes) {
			def thisTNode = tNodes.remove(0)
			def thisTNodeId = thisTNode.'@editionId'
			def wNode = null
			def q = false
			// The <w> node expected next, or null once the word list is
			// exhausted (indexing past the end would hand null to idmatch).
			def candidate = (wIx < wNodes.size()) ? wNodes[wIx] : null
			// Marks a terminal for which nothing can be injected.
			def markUnknown = {
				thisTNode.'@pos' = '--'
				thisTNode.'@form' = '--'
				thisTNode.'@q' = '--'
			}
			// Define the PoS / q / form injecting closure; it reads 'wNode'
			// and 'q' as they are set at the time of the call.
			def injectPos = {
				// Q tag
				thisTNode.'@q' = q ? 'y' : 'n'
				// Form tag
				if (textForm == 'prose') {
					thisTNode.'@form' = 'prose'
				} else if (
					textForm == 'vers' ||
					(
						textForm == 'mixte'
						&& wNode.parent().parent().name().getLocalPart() == 'ab'
					)
				) {
					if (! wNode.'@lb') {
						thisTNode.'@form' = 'vers'
					} else if (wNode.'@lb' == 'start') {
						thisTNode.'@form' = 'vers_debut'
					} else if (wNode.'@lb' == 'end') {
						thisTNode.'@form' = 'vers_fin'
					}
				}
				// DEBUG TMR 22/08/2012
				else {
					thisTNode.'@form' = 'prose'
				}
				// END DEBUG TMR 22/08/2012
				// Find PoS tag in BFM file
				// Try txm:ana tags
				def txmAnaList = wNode[nsTxm.ana]
				// Then interp tags
				if (! txmAnaList) {
					txmAnaList = wNode.interp
				}
				def anaOfType = { wanted -> txmAnaList.find { it.'@type' == wanted } }
				// Try type="#pos_syn"
				if (anaOfType('#pos_syn')) {
					thisTNode.'@pos' = anaOfType('#pos_syn').text()
				// type attr of wNode
				} else if (! txmAnaList && wNode.'@type') {
					thisTNode.'@pos' = wNode.'@type'
				// type="#pos" (a comparison: the original assigned the
				// attribute here, which always succeeded and altered the node)
				} else if (anaOfType('#pos')) {
					thisTNode.'@pos' = anaOfType('#pos').text()
				// type="#fropos"
				} else if (anaOfType('#fropos')) {
					thisTNode.'@pos' = anaOfType('#fropos').text()
				} else {
					thisTNode.'@pos' = '--'
				}
			}
			// End of injecting closure.
			// Check whether wNode matches tNode
			if (thisTNode.'@word' == '#') {
				// Not expecting to find these in BFM file, all OK.
				markUnknown.call()
			} else if (candidate != null && idmatch(thisTNode, candidate)) {
				// Perfect match
				wNode = candidate
				q = wNodesInQ.contains(wNode)
				injectPos.call()
				wIx++
			} else if (thisTNode.'@word' == '*') {
				// Duplicata, use dupl match list to find correct wNode
				def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
				if (duplIdMatch) {
					def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
					// To enhance performance, I assume initially that the wNode matching the
					// duplicata is within +- 10 words of the current position in the
					// list. The wNodes list is very long, and searches over the entire
					// list are avoided.
					assert matchTNode, "$duplIdMatch"
					if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
						wNode = wNodes[wIx-10 .. wIx+10].find {
							idmatch(matchTNode, it)
						}
					}
					// Use whole list if it failed.
					if (! wNode) {
						wNode = wNodes.find {
							idmatch(matchTNode, it)
						}
					}
				}
				if (wNode) {
					q = wNodesInQ.contains(wNode)
					injectPos.call()
					// Don't update wIx value!
				} else {
					// Either the duplicata is not listed or its word is absent.
					markUnknown.call()
					logger.log(
						Level.WARNING,
						"<w/> node matching TS duplicata $thisTNodeId not found."
					)
				}
			} else if (candidate != null && punctTest(candidate)) {
				// not expecting to find these in TS file; try same tNode again.
				tNodes.add(0, thisTNode)
				wIx++
			} else {
				// A single scan of the word list decides between the two
				// remaining cases.
				wNode = wNodes.find {
					idmatch(thisTNode, it)
				}
				if (! wNode) {
					// Check the SRCMF corpus hasn't acquired an extra word
					logger.log(
						Level.WARNING,
						"TS word $thisTNodeId (${thisTNode.'@word'}) is " +
						"absent from BFM file."
					)
					markUnknown.call()
				} else {
					// It doesn't match, it's not a duplicata, it's not BFM
					// punctuation, but it IS found somewhere in the BFM corpus.
					logger.log(
						Level.WARNING,
						"Discontinuity in TS corpus before word $thisTNodeId " +
						"(${thisTNode.'@word'})."
					)
					q = wNodesInQ.contains(wNode)
					injectPos.call()
					wIx = wNodes.indexOf(wNode) + 1
				}
			}
		}
		/*
		 * End Main iteration
		 */
		/*******************
		 * Recalculate headpos
		 */
		logger.log(Level.INFO, "Recalculating headpos.")
		TS.s.graph.nonterminals.nt.each { nt ->
			// "L" edges point at the head(s) of the non-terminal
			def headEdges = nt.edge.findAll { it.'@label' == 'L' }
			def terminals = nt.parent().parent().terminals.t
			def posOf = { edge ->
				terminals.find { it.'@id' == edge.'@idref' }?.'@pos'
			}
			if (headEdges.size() == 0) {
				nt.'@headpos' = '--'
			} else if (headEdges.size() == 1) {
				nt.'@headpos' = posOf(headEdges[0])
			} else {
				// Several heads: take the first PoS of the priority list found
				// among them, marked with '?'. 'collect' is required here;
				// 'each' would return the edges, not their PoS.
				def headPosList = headEdges.collect { posOf(it) }
				for (def headpos : ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET']) {
					if (headPosList.contains(headpos)) {
						nt.'@headpos' = headpos + '?'
						break
					}
				}
			}
			assert nt.'@headpos'
		}
		logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

		// XmlUtil.serialize declares encoding="UTF-8": write the bytes as UTF-8.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(TS)
		}
	}

	// Subroutine to establish whether a Groovy node is punctuation: either its
	// "type" attribute or one of its txm:ana / interp children starts with
	// "pon" (case-insensitive).
	private punctTest(gNode) {
		if (gNode.'@type') {
			return gNode.'@type'.toLowerCase().startsWith('pon')
		}
		if (gNode[nsTxm.ana]) {
			return gNode[nsTxm.ana].find {it.text().toLowerCase().startsWith('pon')} != null
		}
		if (gNode.interp) {
			return gNode.interp.find {it.text().toLowerCase().startsWith('pon')} != null
		}
		return false
	}

	// Matches a TS node with a <w/> node by comparing the trailing integer of
	// their ids. A difference between the word forms is only logged: the
	// nodes still count as matching.
	private idmatch (tNode, wNode) {
		def idMatch = false
		def wNodeId = wNode.'@id'
		if (! wNodeId) {
			wNodeId = wNode.attribute(nsXml.id)
		}
		def tNodeId = tNode.'@editionId'
		// trailing digits, optionally followed by a lower-case / underscore suffix
		def regex = /(\d+)([_a-z]+)?$/
		def m1 = (tNodeId =~ regex)
		def m2 = (wNodeId =~ regex)
		if ( m1.find() && m2.find() ) {
			idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger())
		} else {
			logger.log(
				Level.WARNING,
				"Idmatch routine can't find integer part of one or both of the " +
				"paired Ids for nodes $tNode, $wNode"
			)
		}
		if (! idMatch) {
			return false
		}

		//def wNodeForm = wNode.text() // Enable if treating old-style XML-BFM
		def wNodeForm = getDeepText(wNode[nsTxm.form][0])
		def tNodeForm = tNode.'@word'
		if (tNodeForm != wNodeForm) {
			logger.log(Level.WARNING,
				"Matching IDs ($tNodeId) but unmatched forms: TS form '$tNodeForm', BFM form '$wNodeForm'.")
		}
		return true
	}

	/**
	 * Concatenates all the text found below a node, in document order.
	 *
	 * @param node a Groovy Node, or null
	 * @return the concatenated text; empty for a null node
	 */
	def getDeepText(def node) {
		if (node == null) {
			return ""
		}
		def str = ""
		node.children().each { child ->
			if (child instanceof Node) {
				str += getDeepText(child)
			} else if (child instanceof String) {
				str += child
			}
		}
		return str
	}
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/PunctInjectImport.groovy (revision 1000) | ||
---|---|---|
1 |
// Script to restore punctuation etc. to results of TS query. |
|
2 |
// Inputs: |
|
3 |
// --- Tiger-XML Document node |
|
4 |
// --- Java array: |
|
5 |
// --- --- String [index][type] where: |
|
6 |
// --- --- --- type == 0 gives the xml:id |
|
7 |
// --- --- --- type == 1 gives the word form |
|
8 |
// Process: |
|
9 |
// --- Injects punctuation. |
|
10 |
// Returns: |
|
11 |
// --- Tiger-XML Document node. |
|
12 |
|
|
13 |
// TXM package statement |
|
14 |
package org.txm.scripts.importer.srcmf |
|
15 |
|
|
16 |
import javax.xml.parsers.DocumentBuilderFactory |
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
/** |
|
21 |
* |
|
22 |
* @author tmr |
|
23 |
* |
|
24 |
*/ |
|
25 |
class PunctInjectImport |
|
26 |
{ |
|
27 |
|
|
28 |
static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml') |
|
29 |
static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei') |
|
30 |
def tigerXml = null |
|
31 |
def txmIdWordTableFixed = [] |
|
32 |
def outputFile = null |
|
33 |
|
|
34 |
/* |
|
35 |
* Call this constructor when no need of XML-TXM file for word ID table |
|
36 |
*/ |
|
37 |
/**
 * Builds an injector from a TIGER-XML file and a ready-made word table; use
 * it when the XML-TXM file is not needed to obtain the word ids.
 *
 * @param tigerXmlFile   the TIGER-XML file, parsed into a W3C DOM
 * @param txmIdWordTable the word table, stored as given
 */
PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
	def domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
	def tigerDocument = domBuilder.parse(tigerXmlFile)
	this.tigerXml = tigerDocument.documentElement
	this.txmIdWordTableFixed = txmIdWordTable
	// default target, relative to the working directory
	this.outputFile = new File("output.xml")
}
|
43 |
|
|
44 |
/* |
|
45 |
* Call this constructor when the XML-TXM file is needed for the word list. |
|
46 |
*/ |
|
47 |
/**
 * Builds an injector from a TIGER-XML file and an XML-TXM file; the word
 * table is read from the TEI <w> elements of the latter, one [id, form] pair
 * per word.
 *
 * @param tigerXmlFile the TIGER-XML file, parsed into a W3C DOM
 * @param xmlTxmFile   the XML-TXM file providing word ids and forms
 */
PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
	def domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
	tigerXml = domBuilder.parse(tigerXmlFile).documentElement
	txmIdWordTableFixed = []
	def txmRoot = new XmlParser().parse(xmlTxmFile)
	def teiWords = txmRoot.text.'**'.findAll { it.name() == nsTei.'w' }
	for (def wordNode : teiWords) {
		// plain "id" attribute first, then xml:id
		def wordId = wordNode.'@id'
		if (! wordId) {
			wordId = wordNode.attribute(nsXml.id)
		}
		if (! wordId) {
			// reported only: the word is still added to the table
			println "Error: ID attribute not located in XML TXM file"
		}
		def wordForm = getDeepText(wordNode."txm:form"[0])
		txmIdWordTableFixed << [wordId, wordForm]
	}
	// assert txmIdWordTableFixed.size() > 0
	outputFile = new File("output.xml")
}
|
67 |
|
|
68 |
/**
 * Aligns the TIGER-XML terminals (<t> nodes) with the TXM word table, moves
 * sentence-initial punctuation to the end of the previous sentence, and
 * writes the result to outputFile.
 *
 * Phase 1 walks the word table and the <t> nodes in parallel. Words that are
 * missing from the TIGER-XML are inserted as new <t> nodes (see addTNode);
 * mismatches are reported on stdout and the walk resynchronises.
 *
 * Phase 2 looks, for every <terminals> element except the first, at the
 * leading run of tokens containing no letter or digit, and moves the part of
 * that run ending in closing punctuation to the previous <terminals> element.
 */
def process () {
    // Work on a copy: the loop below consumes the table.
    def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
    // Snapshot of the <t> nodes taken before any insertion; nodes added by
    // addTNode are deliberately not part of it.
    def allTs = toList(this.tigerXml.getElementsByTagName('t'))
    def tAttrs = getTAttrNames(allTs[0])
    def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
    def addedWordIds = []
    int tIx = 0
    while (txmIdWordTable) {
        def word = txmIdWordTable.remove(0)
        if (tIx == allTs.size()) {
            // End of TS file, but still words left in the BFM file.
            addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
        }
        else {
            def tNode = allTs[tIx]
            def tId = getTNodeId(tNode)
            if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
                // All good: id and form both match.
                tIx += 1
            }
            else if (tId == word[0]) {
                println("Mismatched Ids! ($tId)")
                tIx += 1
            }
            else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
                // SRCMF duplicata; try comparing word against the next tNode next time
                // around.
                txmIdWordTable.add(0, word)
                tIx += 1
            }
            // Check that the SRCMF corpus doesn't have a bug in it...
            // Only the nodes strictly before tIx are searched. The former
            // allTs[0..tIx - 1] degenerated to allTs[0..-1] (the whole list)
            // when tIx == 0, so the first node always "found" itself.
            else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/)
                    && tIx > 0
                    && (allTs.subList(0, tIx).find{
                        it.getAttribute('id') == "$tNodeIdPrefix#$tId"
                    })) {
                println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
                txmIdWordTable.add(0, word)
                tIx += 1
            }
            // Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
            else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/)
                    && (allTs[tIx..-1].find{
                        it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
                    })) {
                println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
                txmIdWordTable.add(0, word)
                tIx += 1
            }
            else if (addedWordIds.contains(tId)) {
                println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
                txmIdWordTable.add(0, word)
                tIx += 1
            }
            else {
                // Insert word. In the first instance, it will have the same parent as
                // the tNode before which it's being inserted.
                addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
                addedWordIds.add(word[0])
            }
        }
    }
    // Second phase: move punctuation into previous sentence,
    // dependent on sequence.
    def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
    for (def i = 1 ; i < allTerminalses.size() ; i++) {
        def ts = toList(allTerminalses[i].getElementsByTagName('t'))
        def startPunc = true
        def puncStack = []
        while (ts && startPunc) {
            // A token counts as punctuation when it contains no letter or digit.
            // The class is A-Za-z rather than A-z: A-z also covers [ \ ] ^ _ `
            // and would classify bracket tokens as words.
            if ((ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
                puncStack.add(ts.remove(0))
            }
            else {
                startPunc = false
            }
        }
        // Now, treat the punctuation stack at the beginning of the sentence
        if ( puncStack ) {
            int moveLeft = 0
            // First, identify LAST instance of sentence-final punctuation.
            // One character per stacked node, so string indexes map to node
            // indexes; an empty word yields a neutral placeholder instead of
            // failing on [0].
            def puncString = puncStack.collect{
                def form = it.getAttribute('word')
                form ? form[0] : ' '
            }.join('')
            def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
            if (matches.size() > 0) {
                moveLeft = puncString.lastIndexOf(matches[-1]) + 1
            }
            // Second, split pairs of straight quotes
            matches = puncString =~ /(""|'')/ //"
            if (matches.size() > 0) {
                moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
            }
            // Now, move moveLeft punctuation nodes to the end of the prev. sentence
            ts = toList(allTerminalses[i].getElementsByTagName('t'))
            for (def j = 0 ; j < moveLeft ; j++ ) {
                allTerminalses[i - 1].appendChild(ts[j])
            }
        }
    }
    // NOTE(review): this prints this.tigerXml through its string conversion;
    // confirm that the object assigned in the constructor serialises to XML.
    outputFile.withWriter("UTF-8") { writer ->
        writer.println(this.tigerXml)
    }
}
|
169 |
|
|
170 |
/**
 * Creates a new <t> element for a word and attaches it next to tNode.
 *
 * @param word          pair [id, form]
 * @param tNode         reference node; the new node goes into its parent
 * @param tAttrs        attribute names the new node must carry
 * @param tNodeIdPrefix prefix placed before '#' in the new node's id
 * @param where         'before' (insert before tNode) or 'append'
 *                      (append to tNode's parent)
 * @throws IllegalArgumentException for any other value of where
 */
private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
    def freshNode = tNode.getOwnerDocument().createElement('t')
    tAttrs.each { attrName ->
        // 'id' and 'word' come from the word pair; everything else is a '--' filler.
        def attrValue = '--'
        if (attrName == 'id') {
            attrValue = "${tNodeIdPrefix}#${word[0]}"
        }
        else if (attrName == 'word') {
            attrValue = word[1]
        }
        freshNode.setAttribute(attrName, attrValue)
    }
    if (where == 'before') {
        return tNode.getParentNode().insertBefore(freshNode, tNode)
    }
    if (where == 'append') {
        return tNode.getParentNode().appendChild(freshNode)
    }
    throw new IllegalArgumentException('Bad before value')
}
|
193 |
/**
 * Lists the attribute names present on a node.
 *
 * @param tNode node whose attribute map is read
 * @return list of the node names of all its attributes, in map order
 */
def getTAttrNames(tNode) {
    def attrMap = tNode.attributes
    return (0..<attrMap.getLength()).collect { idx -> attrMap.item(idx).nodeName }
}
|
201 |
/**
 * Returns the part of a node's 'id' attribute after the last '#'.
 *
 * @param tNode node carrying an 'id' attribute
 * @return last '#'-separated token of the id
 */
def getTNodeId(tNode) {
    def idParts = tNode.getAttribute('id').tokenize('#')
    return idParts.last()
}
|
204 |
/**
 * Copies every element of an iterable object into a new collection by
 * filtering it with a predicate that accepts everything.
 *
 * @param iterable object to copy (used here on DOM node lists)
 * @return the result of findAll with an always-true closure
 */
def toList(def iterable) {
    def keepEverything = { item -> true }
    return iterable.findAll(keepEverything)
}
|
207 |
/**
 * Concatenates, in document order, all the String children found below a node.
 *
 * @param node node whose children() are walked recursively
 * @return the concatenated text; children that are neither Node nor String
 *         are ignored
 */
def getDeepText(def node) {
    def collected = new StringBuilder()
    for (def child in node.children()) {
        if (child instanceof Node) {
            // Element child: recurse and append its text.
            collected.append(getDeepText(child))
        } else if (child instanceof String) {
            collected.append(child)
        }
    }
    return collected.toString()
}
|
218 |
|
|
219 |
/**
 * Command-line entry point.
 *
 * Usage: PunctInjectImport.groovy [-o outputfile.xml] TigerXml.xml XmlTxm.xml
 *
 * Exits with status 2 when the options cannot be parsed or when the number
 * of positional arguments is not exactly two.
 */
public static void main(String[] args)
{
    def cli = new CliBuilder(
        usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
    )
    cli.h(longOpt:'help', 'Prints this message.')
    cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
    def options = cli.parse(args)
    if (options == null) {
        // parse() returns null on a parse error, after reporting it itself.
        System.exit(2)
        return
    }
    if (options.h) {
        // usage() prints the help text itself and returns nothing.
        cli.usage()
        return
    }
    if (options.arguments().size() != 2) {
        println 'Incorrect number of command line arguments... exiting'
        cli.usage()
        System.exit(2)
        return
    }
    def tigerXmlFile = new File(options.arguments()[0])
    def xmlTxmFile = new File(options.arguments()[1])
    def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
    if (options.o) {
        pii.outputFile = new File(options.o)
    }
    pii.process()
}
|
243 |
} |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/srcmfLoader.groovy (revision 1000) | ||
---|---|---|
1 |
package org.txm.scripts.importer.srcmf |
|
2 |
|
|
3 |
// srcdir |
|
4 |
// txml/master.xml |
|
5 |
// txml/*.xml |
|
6 |
// txm/*.xml |
|
7 |
|
|
8 |
// 1 import xml-txm |
|
9 |
import ims.tiger.gui.tigerregistry.TIGERRegistry; |
|
10 |
import ims.tiger.index.writer.IndexBuilderErrorHandler; |
|
11 |
import ims.tiger.index.writer.SimpleErrorHandler; |
|
12 |
import ims.tiger.index.writer.XMLIndexing; |
|
13 |
import ims.tiger.system.*; |
|
14 |
|
|
15 |
import javax.xml.stream.*; |
|
16 |
|
|
17 |
import java.io.BufferedWriter; |
|
18 |
import java.io.File; |
|
19 |
import java.io.FileOutputStream; |
|
20 |
import java.io.IOException; |
|
21 |
import java.io.OutputStreamWriter; |
|
22 |
import java.io.PrintStream; |
|
23 |
import java.net.URL; |
|
24 |
import java.util.HashMap; |
|
25 |
import java.util.logging.FileHandler |
|
26 |
import java.io.File; |
|
27 |
|
|
28 |
import org.apache.log4j.BasicConfigurator; |
|
29 |
import org.txm.export.ts.*; |
|
30 |
import org.txm.importer.ApplyXsl2; |
|
31 |
import org.txm.utils.xml.DomUtils; |
|
32 |
import org.txm.importer.ValidateXml; |
|
33 |
import org.txm.importer.XPathResult; |
|
34 |
import org.txm.importer.xmltxm.compiler; |
|
35 |
import org.txm.importer.xml.pager; |
|
36 |
import org.txm.objects.*; |
|
37 |
import org.txm.utils.* |
|
38 |
import org.txm.utils.io.*; |
|
39 |
import org.txm.*; |
|
40 |
import org.txm.importer.xmltxm.*; |
|
41 |
import org.txm.metadatas.*; |
|
42 |
import org.txm.utils.i18n.*; |
|
43 |
import org.w3c.dom.Element |
|
44 |
import org.xml.sax.SAXException; |
|
45 |
|
|
46 |
import javax.xml.parsers.DocumentBuilderFactory |
|
47 |
|
|
48 |
String userDir = System.getProperty("user.home"); |
|
49 |
boolean debug = false |
|
50 |
def MONITOR; |
|
51 |
BaseParameters params; |
|
52 |
try {params = paramsBinding;MONITOR=monitor} catch (Exception) |
|
53 |
{ println "DEV MODE";//exception means we debug |
|
54 |
debug = true |
|
55 |
params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml")) |
|
56 |
params.load() |
|
57 |
if (!org.txm.Toolbox.isInitialized()) { |
|
58 |
Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM")); |
|
59 |
Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger")); |
|
60 |
Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models")); |
|
61 |
Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8"); |
|
62 |
Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ","); |
|
63 |
Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\""); |
|
64 |
Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM")); |
|
65 |
} |
|
66 |
} |
|
67 |
if (params == null) { println "no parameters. Aborting"; return; } |
|
68 |
|
|
69 |
String corpusname = params.getCorpusName(); |
|
70 |
Element corpusElem = params.corpora.get(corpusname); |
|
71 |
String basename = params.name; |
|
72 |
String rootDir = params.rootDir; |
|
73 |
String lang = corpusElem.getAttribute("lang"); |
|
74 |
String model = lang |
|
75 |
String encoding = corpusElem.getAttribute("encoding"); |
|
76 |
boolean annotate = "true" == corpusElem.getAttribute("annotate"); |
|
77 |
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl") |
|
78 |
def xslParams = params.getXsltParams(corpusElem); |
|
79 |
|
|
80 |
File txmSrcDir = new File(rootDir, "xml-txm"); |
|
81 |
File tigerSrcDir = new File(rootDir, "tiger-xml"); |
|
82 |
File headerFile = new File(rootDir, "tiger-xml/header.xml"); |
|
83 |
|
|
84 |
File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename); |
|
85 |
|
|
86 |
binDir.deleteDir(); |
|
87 |
binDir.mkdirs(); |
|
88 |
if (!binDir.exists()) { |
|
89 |
println "Could not create binDir "+binDir |
|
90 |
return; |
|
91 |
} |
|
92 |
if (!txmSrcDir.exists()) { |
|
93 |
println "Error: txm src dir does not exists: "+txmSrcDir |
|
94 |
return; |
|
95 |
} |
|
96 |
if (!tigerSrcDir.exists()) { |
|
97 |
println "Error: tiger src dir does not exists: "+tigerSrcDir |
|
98 |
return; |
|
99 |
} |
|
100 |
File txmDir = new File(binDir,"txm/$corpusname"); |
|
101 |
txmDir.deleteDir(); |
|
102 |
txmDir.mkdirs(); |
|
103 |
// Copy the XML-TXM source files into txmDir. Parameter, metadata and
// .properties files are skipped, and so is any file rejected by ValidateXml.
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
    String srcName = f.getName();
    if (srcName.equals("import.xml") || srcName.matches("metadata\\.....?") || srcName.endsWith(".properties"))
        continue;
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(txmDir, srcName));
    } else {
        println "Won't process file "+f;
    }
}

// Files the following steps work on, in sorted order.
// listFiles() yields an empty result (not null) for an existing directory
// with nothing in it, so emptiness is checked as well as null.
List<File> filelist = txmDir.listFiles();
if (filelist == null || filelist.isEmpty()) {
    println "No txm file to process"
    return;
}
Collections.sort(filelist);
|
121 |
|
|
122 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
123 |
if (MONITOR != null) MONITOR.worked(25, "COMPILING") |
|
124 |
println "-- COMPILING - Building Search Engine indexes" |
|
125 |
println "binDir: $binDir" |
|
126 |
println "txmDir: $txmDir" |
|
127 |
def c = new compiler(); |
|
128 |
if (debug) c.setDebug(); |
|
129 |
c.setLang(lang); |
|
130 |
//c.setSortMetadata(sortMetadata) |
|
131 |
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) { |
|
132 |
println "import process stopped"; |
|
133 |
return; |
|
134 |
} |
|
135 |
|
|
136 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
137 |
if (MONITOR != null) MONITOR.worked(20, "EDITION") |
|
138 |
println "-- EDITION - Building edition" |
|
139 |
new File(binDir,"HTML/$corpusname").deleteDir(); |
|
140 |
new File(binDir,"HTML/$corpusname").mkdirs(); |
|
141 |
File outdir = new File(binDir,"/HTML/$corpusname/default/"); |
|
142 |
outdir.mkdirs(); |
|
143 |
|
|
144 |
|
|
145 |
// Paginate every text into the "default" HTML edition and register the text,
// its edition and each produced page in the import parameters.
// 'second' counts the file names already printed on the progress line.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
    // Text name = file name without its last extension.
    String txtname = srcfile.getName();
    int i = txtname.lastIndexOf(".");
    if(i > 0) txtname = txtname.substring(0, i);
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
    Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
    // Progress line: names separated by ", ", with a line break every 5 names.
    // The counter is incremented unconditionally below; it used to be
    // incremented only inside a test requiring second > 0, so it never left 0.
    if (second > 0) {
        print(", ")
        if ((second % 5) == 0) println ""
    }
    print(srcfile.getName());
    second++
    def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
    Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
    // Pages are numbered from 1; each is registered with the matching entry of ed.getIdx().
    for (int pageIx = 0 ; pageIx < ed.getPageFiles().size() ; pageIx++) {
        String wordid = ed.getIdx().get(pageIx);
        params.addPage(edition, ""+(pageIx + 1), wordid);
    }
}
|
165 |
|
|
166 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
167 |
if (MONITOR != null) MONITOR.worked(20, "FINALIZING") |
|
168 |
File paramFile = new File(binDir, "import.xml"); |
|
169 |
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true; |
|
170 |
|
|
171 |
|
|
172 |
// TMR Modifications start here |
|
173 |
File tigerDir = new File(binDir, "tiger"); |
|
174 |
tigerDir.mkdir(); |
|
175 |
File master = new File(tigerSrcDir, "master.xml"); |
|
176 |
File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file |
|
177 |
|
|
178 |
// Run SRCMF cmd line corpus import script |
|
179 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
180 |
def importer = new SrcmfImporter() |
|
181 |
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir) |
|
182 |
|
|
183 |
// Create TigerXml-POS-PNC |
|
184 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
185 |
println "create TIGER XML POS PNC" |
|
186 |
PunctInjectImport injector; |
|
187 |
if (Toolbox.isInitialized()) { |
|
188 |
// get idx from CQP |
|
189 |
// INCOMPLETE |
|
190 |
// def injector = new PunctInjectImport(tmpFile, words) |
|
191 |
println "Error: the loader is not ready to be launched from RCP" |
|
192 |
return; |
|
193 |
} else { |
|
194 |
injector = new PunctInjectImport(tigerXmlAll, txmSrcFile) |
|
195 |
} |
|
196 |
injector.outputFile = new File(tigerDir, "TigerPnc.xml") |
|
197 |
injector.process() |
|
198 |
|
|
199 |
// Create the tigersearch.logprop file (used to launch TIGERSearch) |
|
200 |
File logprop = new File(tigerDir, "tigersearch.logprop"); |
|
201 |
logprop.withWriter("UTF-8") { writer -> |
|
202 |
writer.write("""# Default log configuration of the TIGERSearch suite |
|
203 |
|
|
204 |
log4j.rootLogger=WARN,Logfile |
|
205 |
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO |
|
206 |
|
|
207 |
log4j.appender.Logfile=org.apache.log4j.RollingFileAppender |
|
208 |
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log |
|
209 |
log4j.appender.Logfile.MaxFileSize=500KB |
|
210 |
log4j.appender.Logfile.MaxBackupIndex=1 |
|
211 |
|
|
212 |
log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout |
|
213 |
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""") |
|
214 |
} |
|
215 |
|
|
216 |
// Run TigerRegistry |
|
217 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); } |
|
218 |
BasicConfigurator.configure(); |
|
219 |
String uri = tigerXmlAll.getAbsolutePath(); |
|
220 |
File tigerBinDir = new File(tigerDir, corpusname) |
|
221 |
tigerBinDir.mkdir() |
|
222 |
try { |
|
223 |
IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()); |
|
224 |
XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false); |
|
225 |
indexing.startIndexing(); |
|
226 |
} |
|
227 |
catch (IOException e) { System.out.println("IO: "+e.getMessage()); } |
|
228 |
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); } |
|
229 |
|
|
230 |
tigerXmlAll.delete() |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TigerXMLHeader.xsd (revision 1000) | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> |
|
3 |
|
|
4 |
<!-- ======================================================================= |
|
5 |
XML SubSchema for the header part of the TIGER-XML format |
|
6 |
http://www.ims.uni-stuttgart.de/projekte/TIGER/publicTigerXMLHeader.xsd |
|
7 |
======================================================================= |
|
8 |
TIGER Project, Wolfgang Lezius |
|
9 |
IMS, University of Stuttgart, 04/01/2003 |
|
10 |
======================================================================= --> |
|
11 |
|
|
12 |
|
|
13 |
<!-- ====================================================== |
|
14 |
DECLARATION OF THE HEADER |
|
15 |
====================================================== --> |
|
16 |
|
|
17 |
|
|
18 |
<!-- declaration of the head element --> |
|
19 |
|
|
20 |
<xs:element name="head" type="headType"/> |
|
21 |
|
|
22 |
|
|
23 |
<!-- declaration of the header type --> |
|
24 |
|
|
25 |
<xs:complexType name="headType"> |
|
26 |
|
|
27 |
<xs:sequence> |
|
28 |
<xs:element name="meta" type="metaType" minOccurs="0" maxOccurs="1"/> |
|
29 |
<xs:element name="annotation" type="annotationType" minOccurs="0" maxOccurs="1"/> |
|
30 |
</xs:sequence> |
|
31 |
|
|
32 |
<!-- optional: reference to external header file |
|
33 |
|
|
34 |
The header of a TigerXML corpus can also be stored in separate file. |
|
35 |
This attribute points to the external header file. The pointer is |
|
36 |
an URI. Examples: file:relative.xml or file:/path/to/absolute.xml |
|
37 |
|
|
38 |
Note: If there is a pointer to an external file, the head |
|
39 |
element must be empty. --> |
|
40 |
|
|
41 |
<xs:attribute name="external" type="xsd:anyURI"/> |
|
42 |
|
|
43 |
</xs:complexType> |
|
44 |
|
|
45 |
|
|
46 |
<!-- declaration of the meta information type --> |
|
47 |
|
|
48 |
<xs:complexType name="metaType"> |
|
49 |
|
|
50 |
<xs:sequence> |
|
51 |
<xs:element name="name" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
52 |
<xs:element name="author" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
53 |
<xs:element name="date" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
54 |
<xs:element name="description" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
55 |
<xs:element name="format" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
56 |
<xs:element name="history" type="xsd:string" minOccurs="0" maxOccurs="1"/> |
|
57 |
</xs:sequence> |
|
58 |
|
|
59 |
</xs:complexType> |
|
60 |
|
|
61 |
|
|
62 |
<!-- declaration of the annotation type --> |
|
63 |
|
|
64 |
<xs:complexType name="annotationType"> |
|
65 |
|
|
66 |
<xs:sequence> |
|
67 |
<xs:element name="feature" type="featureType" minOccurs="1" maxOccurs="unbounded"/> |
|
68 |
<xs:element name="edgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/> |
|
69 |
<xs:element name="secedgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/> |
|
70 |
</xs:sequence> |
|
71 |
|
|
72 |
</xs:complexType> |
|
73 |
|
|
74 |
|
|
75 |
<!-- declaration of the feature type --> |
|
76 |
|
|
77 |
<xs:complexType name="featureType"> |
|
78 |
|
|
79 |
<xs:sequence> |
|
80 |
<xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/> |
|
81 |
</xs:sequence> |
|
82 |
|
|
83 |
<xs:attribute name="name" type="featurenameType" use="required"/> |
|
84 |
|
|
85 |
<xs:attribute name="domain" use="required"> |
|
86 |
<xs:simpleType> |
|
87 |
<xs:restriction base="xsd:string"> |
|
88 |
<xs:enumeration value="T"/> <!-- feature for terminal nodes --> |
|
89 |
<xs:enumeration value="NT"/> <!-- feature for nonterminal nodes --> |
|
90 |
<xs:enumeration value="FREC"/> <!-- feature for both --> |
|
91 |
</xs:restriction> |
|
92 |
</xs:simpleType> |
|
93 |
</xs:attribute> |
|
94 |
|
|
95 |
</xs:complexType> |
|
96 |
|
|
97 |
|
|
98 |
<!-- declaration of the (secondary) edge label type --> |
|
99 |
|
|
100 |
<xs:complexType name="edgelabelType"> |
|
101 |
|
|
102 |
<xs:sequence> |
|
103 |
<xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/> |
|
104 |
</xs:sequence> |
|
105 |
|
|
106 |
</xs:complexType> |
|
107 |
|
|
108 |
|
|
109 |
<!-- declaration of the feature value type --> |
|
110 |
|
|
111 |
<xs:complexType name="featurevalueType"> |
|
112 |
|
|
113 |
<xs:simpleContent> <!-- element content: documentation of the feature value --> |
|
114 |
<xs:extension base="xsd:string"> |
|
115 |
<xs:attribute name="name" type="xsd:string"/> |
|
116 |
</xs:extension> |
|
117 |
</xs:simpleContent> |
|
118 |
|
|
119 |
|
|
120 |
</xs:complexType> |
|
121 |
|
|
122 |
|
|
123 |
<!-- ====================================================== |
|
124 |
HEADER DECLARATIONS THAT SHOULD BE REFINED |
|
125 |
====================================================== --> |
|
126 |
|
|
127 |
<!-- declaration of the FEATURE NAMES used in the corpus header; |
|
128 |
this type is unrestricted, but should be refined by a |
|
129 |
specialised, corpus-dependent schema --> |
|
130 |
|
|
131 |
<xs:simpleType name="featurenameType"> |
|
132 |
|
|
133 |
<xs:restriction base="xsd:string"> |
|
134 |
<xs:minLength value="1"/> |
|
135 |
<xs:maxLength value="20"/> |
|
136 |
<xs:whiteSpace value="preserve"/> |
|
137 |
</xs:restriction> |
|
138 |
|
|
139 |
</xs:simpleType> |
|
140 |
|
|
141 |
|
|
142 |
</xs:schema> |
|
0 | 143 |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImport.groovy (revision 1000) | ||
---|---|---|
1 |
package org.txm.scripts.importer.tigersearch; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.util.ArrayList; |
|
5 |
|
|
6 |
import ims.tiger.index.writer.* |
|
7 |
import ims.tiger.system.* |
|
8 |
|
|
9 |
import org.txm.Toolbox; |
|
10 |
import org.txm.importer.ApplyXsl2; |
|
11 |
import org.txm.importer.xtz.* |
|
12 |
import org.txm.objects.BaseParameters |
|
13 |
import org.txm.utils.BundleUtils; |
|
14 |
import org.txm.utils.io.FileCopy; |
|
15 |
import org.apache.log4j.BasicConfigurator; |
|
16 |
import org.txm.importer.xtz.* |
|
17 |
import org.txm.scripts.importer.xtz.* |
|
18 |
|
|
19 |
class TSImport extends XTZImport { |
|
20 |
|
|
21 |
public TSImport(BaseParameters params) { |
|
22 |
super(params); |
|
23 |
} |
|
24 |
|
|
25 |
@Override |
|
26 |
public void init(BaseParameters p) { |
|
27 |
super.init(p); |
|
28 |
|
|
29 |
importer = new TSImporter(this); // only to build metadata |
|
30 |
compiler = new XTZCompiler(this) |
|
31 |
annotater = null; // no annotater step to do |
|
32 |
pager = new XTZPager(this) |
|
33 |
} |
|
34 |
|
|
35 |
/** |
|
36 |
* Do a XTZ Import then build the TIGERSearch indexes in the binary corpus "tiger" directory |
|
37 |
*/ |
|
38 |
@Override |
|
39 |
public void start() throws InterruptedException { |
|
40 |
File tigerSrcDir = sourceDirectory |
|
41 |
|
|
42 |
def xmlFiles = [] |
|
43 |
sourceDirectory.listFiles(new FileFilter() { |
|
44 |
boolean accept(File file) { |
|
45 |
if (file.isDirectory()) return false; |
|
46 |
if (file.isHidden()) return false; |
|
47 |
if (file.getName().equals("import.xml")) return false; |
|
48 |
if (!file.getName().endsWith(".xml")) return false; |
|
49 |
|
|
50 |
xmlFiles << file |
|
51 |
} |
|
52 |
}); |
|
53 |
|
|
54 |
xmlFiles.remove(new File(sourceDirectory, "import.xml")) |
|
55 |
|
|
56 |
if (xmlFiles.size() == 0) { |
|
57 |
println "Error no XML file found in $sourceDirectory" |
|
58 |
isSuccessful = false; |
|
59 |
return; |
|
60 |
} |
|
61 |
|
|
62 |
File master = xmlFiles[0]; |
|
63 |
println "Main TIGER XML file found: $master" |
|
64 |
|
|
65 |
File tsXSLFile = new File(Toolbox.getTXMHOMEPATH(), "xsl/ts.xsl"); |
|
66 |
BundleUtils.copyFiles("TIGERSearchRCP", "src", "org/txm/importer/tigersearch", "ts.xsl", tsXSLFile.getParentFile()); |
|
67 |
|
|
68 |
File xmltxmSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation |
|
69 |
xmltxmSrcDir.mkdirs(); |
|
70 |
FileCopy.copy(master, new File(xmltxmSrcDir, master.getName())); |
|
71 |
|
|
72 |
if (!ApplyXsl2.processImportSources(tsXSLFile, xmltxmSrcDir, xmltxmSrcDir)) { |
|
73 |
println "Error while applying TS XSL file to $tigerSrcDir" |
|
74 |
isSuccessful = false; |
|
75 |
return; |
|
76 |
} |
|
77 |
|
|
78 |
File[] files = xmltxmSrcDir.listFiles(); |
|
79 |
if (files == null || files.length == 0) { |
|
80 |
println "Error while applying TS XSL file to $xmltxmSrcDir is empty" |
|
81 |
isSuccessful = false; |
|
82 |
return; |
|
83 |
} |
|
84 |
|
|
85 |
sourceDirectory = xmltxmSrcDir; // hop |
|
86 |
File txmDir = new File(binaryDirectory, "txm/"+corpusName); |
|
87 |
txmDir.mkdirs(); |
|
88 |
FileCopy.copyFiles(sourceDirectory, txmDir) // the compiler step will use these files |
|
89 |
|
|
90 |
super.start(); // call the usual XTZ import |
|
91 |
|
|
92 |
if (isSuccessful) { |
|
93 |
|
|
94 |
File tigerDir = new File(binaryDirectory, "tiger"); |
|
95 |
tigerDir.mkdir(); |
|
96 |
|
|
97 |
|
|
98 |
File logprop = new File(tigerDir, "tigersearch.logprop"); |
|
99 |
|
|
100 |
logprop.withWriter("UTF-8") { writer -> |
|
101 |
writer.write("""# Default log configuration of the TIGERSearch suite |
|
102 |
log4j.rootLogger=WARN,Logfile |
|
103 |
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=WARNING |
|
104 |
log4j.appender.Logfile=org.apache.log4j.RollingFileAppender |
Formats disponibles : Unified diff