Révision 1000
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/TigerSubcorpus2Main.groovy (revision 1000) | ||
|---|---|---|
| 1 | /* | |
| 2 | Combines subcorpus files to give a single main file. | |
| 3 | */ | |
| 4 | // TXM package statement | |
| 5 | package org.txm.scripts.importer.srcmf | |
| 6 |  | |
| 7 | // Command-line entry point | |
| 8 |  | |
// Build the option parser, validate the command line, then hand the two
// files over to script().
def cli = new CliBuilder(
    usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
options = cli.parse(args)
// parse() reports the problem itself and returns null when the command line
// cannot be parsed; stop here instead of failing on a null dereference.
if (!options) {
    System.exit(2)
}
// Honour the -h/--help flag that is declared above.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 1) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints the help text itself and returns nothing, so it must not
    // be wrapped in println (that printed an extra "null" line).
    cli.usage()
    System.exit(2)
}
def tigerMaster = new File(options.arguments()[0])
// Default output name, overridden by -o/--output.
def outputFile = new File(options.o ?: 'outputfile.xml')
script(tigerMaster, outputFile)
| 26 |  | |
/**
 * Inlines every subcorpus file referenced by a master file into the master
 * document and saves the merged document as a single UTF-8 XML file.
 *
 * Each <subcorpus> element found under a <body> element is removed and the
 * children of the root element of the referenced file are appended to that
 * <body> in its place.
 *
 * @param tigerMaster master file; subcorpus paths are resolved against its
 *                    parent directory
 * @param outputFile  file that receives the merged document
 */
def script (File tigerMaster, File outputFile) {
    def mergedDoc = new XmlParser().parse(tigerMaster)
    File corpusDir = tigerMaster.getParentFile()
    mergedDoc.body.each { bodyNode ->
        bodyNode.subcorpus.each { ref ->
            // The first five characters of @external are dropped to obtain
            // the file name (presumably a "file:" prefix -- TODO confirm).
            String relativeName = ("" + ref.@external).substring(5)
            def parsedSub = new XmlParser().parse(new File(corpusDir, relativeName))
            bodyNode.remove(ref)
            parsedSub.children().each { bodyNode.append(it) }
        }
    }
    // Pretty-print into memory first, then write declaration + document.
    def buffer = new StringWriter()
    def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
    printer.print(mergedDoc)
    String xmlText = buffer.toString()
    outputFile.withWriter("UTF-8") { out ->
        out.println('<?xml version="1.0" encoding="UTF-8"?>')
        out.print(xmlText)
    }
}
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/package.html (revision 1000) | ||
|---|---|---|
| 1 | <html> | |
| 2 | <body> | |
| 3 | <p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources</p> | |
| 4 | </body> | |
| 5 | </html> | |
| 0 | 6 | |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/MasterReheader.groovy (revision 1000) | ||
|---|---|---|
| 1 | #! /usr/bin/groovy | |
| 2 | package org.txm.scripts.importer.srcmf; | |
| 3 |  | |
| 4 | /* | |
| 5 | * To change this template, choose Tools | Templates | |
| 6 | * and open the template in the editor. | |
| 7 | */ | |
| 8 |  | |
| 9 | // Set up globals | |
| 10 | // def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
 | |
| 11 | // def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
 | |
| 12 | // def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
 | |
| 13 | // def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']] | |
| 14 | // def firstFeat = [t:'word', nt:'cat'] | |
| 15 |  | |
| 16 | // Command-line entry point | |
// Build the option parser, validate the command line, then call script().
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)
// parse() returns null (after reporting the error itself) on a malformed
// command line, hence the null check before touching the options.
if (options && options.h) {
    // Honour the -h/--help flag that is declared above.
    cli.usage()
} else if (options && options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    // Default output goes next to the master file.
    def outputFile = options.o ?
        new File(options.o) : new File(masterFolder, 'MasterReheader_out.xml')
    // Defaults for the first feature of each domain; they were previously
    // computed but never passed on to script().
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    script(
        masterFile, headerFile, outputFile,
        // 'nts' / 'ts' rely on CliBuilder's plural accessor (all values given
        // for -nt / -t); it yields false when the option is absent, so fall
        // back to an empty list that script() can query with contains().
        ['nt':options.nts ?: [], 't':options.ts ?: []],
        ['nt':ntfirst, 't':tfirst]
    )
} else if (options) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints the help text itself and returns nothing.
    cli.usage()
}
| 54 |  | |
/**
 * Rebuilds the <feature> declarations of a master file from the attributes
 * actually used in its subcorpus files, and saves the result.
 *
 * Steps: collect every attribute name/value found on <t> and <nt> nodes of
 * the subcorpus files listed in the master file; remove the existing
 * <feature> nodes from the master head; create one <feature> node per
 * attribute (except 'id'), the "first" feature of each domain being listed
 * first; for the features named in 'feats', add one <value> node per observed
 * value, whose text is copied from the header file when a matching entry
 * exists and is '[unknown]' otherwise.
 *
 * @param masterFile master file to re-header
 * @param headerFile header file providing the descriptions of feature values
 * @param outputFile file that receives the new master document (UTF-8)
 * @param feats      map with keys 'nt' and 't' giving the features for which
 *                   value nodes must be written; a missing entry means none
 * @param firstFeat  map with keys 'nt' and 't' giving the feature to list
 *                   first; an entry that is not an observed attribute is
 *                   replaced IN THIS MAP by the alphabetically first one
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // attrVal[domain][attribute name] -> set of observed values
    def attrVal = [nt:[:], t:[:]]

    // Records every attribute of one t / nt node.
    def getvals = { node, type ->
        node.attributes().each { mEntry ->
            if (! attrVal[type].containsKey(mEntry.getKey())) {
                attrVal[type][mEntry.getKey()] = new HashSet()
            }
            attrVal[type][mEntry.getKey()].add(mEntry.getValue())
        }
    }

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each {
        // The first five characters of @external are skipped to get the name.
        def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')

    // Remove old feature nodes in master file
    def oldFeatureNodes = master.head.annotation.feature
    while (oldFeatureNodes) {
        // Declared locally so it no longer leaks into the script binding.
        def node = oldFeatureNodes.pop()
        node.parent().remove(node)
    }
    assert (! master.head.annotation.feature)

    // Check firstFeat was relevant
    ['t', 'nt'].each { type ->
        if (! (attrVal[type].keySet().contains(firstFeat[type]))) {
            firstFeat[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(firstFeat['t'])
    assert attrVal['nt'].keySet().contains(firstFeat['nt'])

    // Ordered feature names per domain: first feature, then all the others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [firstFeat[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
    }

    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        // Tolerate a missing / false entry (e.g. option not given on the CLI).
        def wanted = feats[type] ?: []
        featList[type].each { feat ->
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value nodes if the feature is listed in 'feats'
            if (wanted.contains(feat)) {
                // Sanity checks on the header; they do not depend on the
                // value, so they are evaluated once per feature.
                assert header.'**'.feature
                assert header.'**'.feature[0].'@name'
                assert header.'**'.feature[0].'@domain'
                assert ['NT', 'T'].contains(header.'**'.feature[0].'@domain')
                // Header declaration of this feature (same domain or 'FREC');
                // also independent of the value, so looked up once.
                def hFNode = header.'**'.feature.find {
                    it.'@name' == feat && (
                        it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                    )
                }
                attrVal[type][feat].each { value ->
                    def hVNode = hFNode ? hFNode.value.find { it.'@name' == value } : null
                    def vText = hVNode ? hVNode.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() emits an XML declaration with
    // encoding="UTF-8", so the bytes must be written as UTF-8 as well rather
    // than in the platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
| 138 |  | |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SrcmfImporter.groovy (revision 1000) | ||
|---|---|---|
| 1 | /* | |
| 2 | * Calls all import scripts | |
| 3 | */ | |
| 4 |  | |
| 5 | /** | |
| 6 | * | |
| 7 | * @author tmr | |
| 8 | */ | |
| 9 |  | |
| 10 | // Command line form of import statements: | |
| 11 | // import MasterReheader | |
| 12 | // import PunctInjectImport | |
| 13 | // import SubcorpusDataInject | |
| 14 | // import TigerSubcorpus2Main | |
| 15 | // import java.util.logging.FileHandler | |
| 16 | // import javax.xml.parsers.DocumentBuilderFactory | |
| 17 |  | |
| 18 | // TXM package statement | |
| 19 | package org.txm.scripts.importer.srcmf | |
| 20 |  | |
| 21 | import java.util.logging.*; | |
| 22 |  | |
| 23 |  | |
| 24 | // Command line entry point | |
def cli = new CliBuilder(
    usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
options = cli.parse(args)
// parse() reports the problem itself and returns null when the command line
// cannot be parsed.
if (!options) {
    System.exit(2)
}
// Honour the -h/--help flag that is declared above.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 3) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints the help text itself and returns nothing, so it must not
    // be wrapped in println (that printed an extra "null" line).
    cli.usage()
    System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The working directory passed on is the grand-parent of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
    tigerFile,
    txmFile,
    headerFile,
    txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The merged file is only an intermediate product of the punctuation step.
tigerXmlAll.delete()
| 47 |  | |
/**
 * Runs every import step except punctuation injection: PoS injection into the
 * subcorpus files, regeneration of the master header, and merge of master and
 * subcorpus files into one file.
 *
 * @param tigerFile  TIGER master file to import
 * @param txmFile    XML-TXM file; only its parent directory is used here
 * @param headerFile header file handed to the reheader step
 * @param binDir     directory in which the "tiger" sub-directory and the
 *                   "tiger.log" log file are created
 * @return the merged file "TigerAll.xml" located in the "tiger" sub-directory
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
    // Run pos injection script
    File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
    File tigerDir = new File(binDir, "tiger")
    tigerDir.mkdir()
    File masterpos = new File(tigerDir, "master_pos.xml")
    // NOTE(review): the first file listed in the directory of txmFile is used,
    // not txmFile itself -- confirm this is intended.
    File xmltxm = txmSrcDir.listFiles()[0]
    File logFile = new File(binDir, "tiger.log")
    def logHandler = new FileHandler(logFile.getAbsolutePath())
    try {
        def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
        sdi.processMaster(tigerFile, masterpos)
    } finally {
        // Detach and close the handler so the log file (and its lock file)
        // is released and handlers do not pile up on the shared logger.
        SubcorpusDataInject.logger.removeHandler(logHandler)
        logHandler.close()
    }
    // Run reheader script
    def reheader = new MasterReheader()
    File tmp = File.createTempFile("tmp", ".xml",tigerDir)
    def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
    def firstFeat = ['nt':'cat', 't':'word']
    reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
    // createTempFile() already creates the (empty) file, so existence alone
    // proves nothing: an empty file also means nothing was written.
    if (!tmp.exists() || tmp.length() == 0) {
        println "Error: reheader failed"
    }
    masterpos.delete()
    // renameTo() signals failure through its return value only.
    if (!tmp.renameTo(masterpos)) {
        println "Error: could not rename ${tmp.getName()} to ${masterpos.getName()}"
    }
    // Run merge master & subcorpus script
    def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
    def mergescript = new TigerSubcorpus2Main()
    mergescript.script(masterpos, tigerXmlAll)
    return tigerXmlAll
}
| 78 |  | |
/**
 * Runs the punctuation injection step on the merged TIGER file.
 *
 * The result is directed to "TigerPnc.xml", in the same directory as the
 * merged file.
 *
 * @param tigerXmlAll merged TIGER-XML file
 * @param txmFile     XML-TXM file handed to the injector
 * @return whatever the injector's process() call returns
 */
def doPnc(File tigerXmlAll, File txmFile) {
    injector = new PunctInjectImport(tigerXmlAll, txmFile)
    File pncTarget = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
    injector.outputFile = pncTarget
    return injector.process()
}
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/SubcorpusDataInject.groovy (revision 1000) | ||
|---|---|---|
| 1 | #! /usr/bin/groovy | |
| 2 | package org.txm.scripts.importer.srcmf; | |
| 3 | /* | |
| 4 | * To change this template, choose Tools | Templates | |
| 5 | * and open the template in the editor. | |
| 6 | */ | |
| 7 |  | |
| 8 | import java.util.logging.Logger | |
| 9 | import java.util.logging.FileHandler | |
| 10 | import java.util.logging.SimpleFormatter | |
| 11 | import java.util.logging.Level | |
| 12 |  | |
| 13 |  | |
| 14 | class SubcorpusDataInject {
 | |
| 15 | // Globals | |
| 16 | def textForm = 'prose' | |
| 17 | def TEI = null | |
| 18 | def wNodes = [] | |
| 19 | def wNodesInQ = [] | |
| 20 | // Set up namespaces | |
| 21 | 	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
 | |
| 22 | 	static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
 | |
| 23 | // Set up a logger | |
| 24 | 	static logger = Logger.getLogger('subcorpusDataInjectLog')
 | |
| 25 | // END GLOBALS | |
| 26 |  | |
	/**
	 * Loads an XML-TXM file and prepares the word lists used by the
	 * processing methods.
	 *
	 * The constructor attaches the given handler to the class logger, parses
	 * the file, determines the text form, optionally marks words that start or
	 * end a line, and finally fills wNodes and wNodesInQ.
	 *
	 * @param xmlTxmFile  XML-TXM file to parse
	 * @param fh          log handler added to the shared static logger
	 * @param newTextForm default text form; ignored when empty, and replaced
	 *                    by the value read from the TEI header when the header
	 *                    contains a catRef
	 */
	SubcorpusDataInject (
	File xmlTxmFile,
	FileHandler fh,
	String newTextForm
	) {
		// Reset global values
		if (newTextForm) {
			textForm = newTextForm
		}
		// Set up logger
		def formatter = new SimpleFormatter()
		logger.addHandler(fh)
		logger.setLevel(Level.ALL)
		fh.setFormatter(formatter)
		// Parse the XML-TXM file
		logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
		TEI = new XmlParser().parse(xmlTxmFile)
		// Get text form from TEI header (if expressed)
		logger.log(Level.INFO, "Reading TEI header to detect text form.")
		if (TEI.teiHeader.profileDesc.textClass.catRef) {
			// Target of the first catRef without its first character
			// (presumably a leading '#' -- TODO confirm).
			def catId = TEI.teiHeader.profileDesc.textClass.catRef.'@target'[0][1..-1]
			// The text form is the text of the first catDesc of the category
			// whose id or xml:id equals catId. No guard for a missing category.
			textForm = TEI.teiHeader.encodingDesc.'**'.category.find {
				it.'@id' == catId || it.attribute(nsXml.id) == catId
			}.catDesc[0].text()
			logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
		} else {
			logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
		}
		// Convert Lbs to Property
		if (textForm == 'vers' || textForm == 'mixte') {
			logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
			// Work list of nodes still to visit, consumed from the front;
			// children are pushed back at the front, which keeps document order.
			def undone = TEI.text.body
			// True when an <lb> was met since the last non-punctuation word.
			def lastLb = false
			// Last non-punctuation word seen so far.
			def lastWord = null
			while (undone) {
				def aNode = undone.remove(0)
				if (aNode.name().getLocalPart() == 'lb') {
					lastLb = true
					// The word preceding a line break ends a line.
					if (lastWord) {
						lastWord.'@lb' = 'end'
					}
				} else if (aNode.name().getLocalPart() == 'w') {
					// Punctuation words neither receive the property nor
					// reset the pending line break.
					if (!punctTest(aNode)) {
						if (lastLb) {
							aNode.'@lb' = 'start'
						}
						lastWord = aNode
						lastLb = false
					}
				} else {
					// Any other element: visit its element children next
					// (text children are skipped).
					undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
				}
			}
		}
		// Build node lists
		logger.log(Level.INFO, "Building node lists.")
		// All <w> nodes of the text, and those located inside a <q> element.
		wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
		wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
	}
| 86 |  | |
| 87 | 	static main(def args) {
 | |
| 88 | // Parse command line args | |
| 89 | def outputFile = 'test-out.xml' | |
| 90 | def logName = 'subcorpusDataInjectLog.log' | |
| 91 | def cli = new CliBuilder( | |
| 92 | usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml' | |
| 93 | ) | |
| 94 | cli.h(longOpt:'help', 'Prints this message') | |
| 95 | cli.m(longOpt:'master', 'Specifies input file is a TS master file.') | |
| 96 | cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.') | |
| 97 | cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.') | |
| 98 | cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file') | |
| 99 | println args | |
| 100 | def options = cli.parse(args) | |
| 101 | // Call XML TXM loader | |
| 102 | 		if (options.arguments().size() == 2) {
 | |
| 103 | def arglist = [new File (options.arguments()[0])] | |
| 104 | 			if (options.logfile) {
 | |
| 105 | arglist.add(new FileHandler(options.logfile)) | |
| 106 | 			} else {
 | |
| 107 | arglist.add(new FileHandler(logName)) | |
| 108 | } | |
| 109 | 			if (options.v) {
 | |
| 110 | 				arglist.add('vers')
 | |
| 111 | 			} else {
 | |
| 112 | 				arglist.add('')
 | |
| 113 | } | |
| 114 | def sdi = new SubcorpusDataInject(arglist[0], arglist[1], arglist[2]) | |
| 115 | // Call process master or process subcorpus | |
| 116 | arglist = [new File(options.arguments()[1]).getCanonicalFile()] | |
| 117 | 			if (options.output) {
 | |
| 118 | arglist.add(new File (options.output)) | |
| 119 | 			} else {
 | |
| 120 | arglist.add(new File (outputFile)) | |
| 121 | } | |
| 122 | 			if (options.m) {
 | |
| 123 | sdi.processMaster(arglist[0], arglist[1]) | |
| 124 | 			} else {
 | |
| 125 | sdi.processSubcorpus(arglist[0], arglist[1]) | |
| 126 | } | |
| 127 | 		} else {
 | |
| 128 | println "Wrong number of arguments." | |
| 129 | println cli.usage() | |
| 130 | } | |
| 131 | } | |
| 132 |  | |
| 133 | 	def processMaster (File masterFile, File outputFile) {
 | |
| 134 | // Runs the transform on each subcorpus listed in a master file. | |
| 135 | // Resaves the master file. | |
| 136 | logger.log(Level.INFO, "Loading corpus master file") | |
| 137 | def masterFolder = '' | |
| 138 | def outputFolder = '' | |
| 139 | 		if (masterFile.getParent()) {
 | |
| 140 | // If not cwd | |
| 141 | masterFolder = masterFile.getParent() | |
| 142 | } | |
| 143 | 		if (outputFile.getParent()) {
 | |
| 144 | // If not cwd | |
| 145 | outputFolder = outputFile.getParent() | |
| 146 | } | |
| 147 | def master = new XmlParser().parse(masterFile) | |
| 148 | // Process each subcorpus file | |
| 149 | 		master.body.subcorpus.each {
 | |
| 150 | processSubcorpus( | |
| 151 | new File (masterFolder, it.'@external'[5..-1]), | |
| 152 | 					new File (outputFolder, "${it.'@external'[5..-5]}_pos.xml")
 | |
| 153 | ) | |
| 154 | 			it.'@external' = "${it.'@external'[0..-5]}_pos.xml"
 | |
| 155 | } | |
| 156 | 		logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")
 | |
| 157 |  | |
| 158 | 		outputFile.withWriter { writer ->
 | |
| 159 | writer << groovy.xml.XmlUtil.serialize(master) | |
| 160 | } | |
| 161 | logger.log(Level.INFO, "Run reheader on this file!") | |
| 162 | } | |
| 163 |  | |
| 164 | 	def processSubcorpus (File subcorpusFile, File outputFile) {
 | |
| 165 | // Parse Subcorpus file | |
| 166 | 		logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
 | |
| 167 | def TS = new XmlParser().parse(subcorpusFile) | |
| 168 | // This is used as a stack | |
| 169 | def tNodes = TS.s.graph.terminals.t | |
| 170 | // This isn't | |
| 171 | def tNodesFixed = TS.s.graph.terminals.t | |
| 172 |  | |
| 173 | // Build duplicata correspondance list | |
| 174 | logger.log(Level.INFO, "Building duplicata correspondance list") | |
| 175 | 		def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
 | |
| 176 | def duplId = dupl.'@id' | |
| 177 | 			def realId = dupl.parent().parent().nonterminals.nt.edge.find {
 | |
| 178 | it.'@idref' == duplId | |
| 179 | 			}.parent().secedge.find {
 | |
| 180 | it.'@label' == 'dupl' | |
| 181 | }.'@idref' | |
| 182 | [duplId, realId] | |
| 183 | } | |
| 184 | 		logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")
 | |
| 185 |  | |
| 186 | // Add pos & form tags | |
| 187 | logger.log(Level.INFO, "Adding PoS and form tags to each t-node") | |
| 188 | // Get first wNode corresponding to a tNode | |
| 189 | def i = 0 | |
| 190 | def wIx = null | |
| 191 | 		while (i < tNodes.size()) {
 | |
| 192 | // Avoid trying to find "fake" first words, it's very inefficient | |
| 193 | 			if (tNodes[i].'@word' != '#') {
 | |
| 194 | 				wIx = wNodes.indexOf(wNodes.find { idmatch(tNodes[i], it) })
 | |
| 195 | } | |
| 196 | 			if (wIx) {
 | |
| 197 | break | |
| 198 | 			} else {
 | |
| 199 | i++ | |
| 200 | } | |
| 201 | } | |
| 202 | assert wIx, "$i" | |
| 203 | /* | |
| 204 | * MAIN ITERATOR BEGINS HERE FOR POS INJECTION | |
| 205 | */ | |
| 206 | // Iterate over tNodes. | |
| 207 | 		while (tNodes) {
 | |
| 208 | def thisTNode = tNodes.remove(0) | |
| 209 | def thisTNodeId = thisTNode.'@editionId' | |
| 210 | def wNode = null | |
| 211 | def q = false | |
| 212 | // Define the PoS / q / form injecting closure | |
| 213 | 			def injectPos = {
 | |
| 214 | // Q tag | |
| 215 | 				if (q) {
 | |
| 216 | thisTNode.'@q' = 'y' | |
| 217 | 				} else {
 | |
| 218 | thisTNode.'@q' = 'n' | |
| 219 | } | |
| 220 | // Form tag | |
| 221 | 				if (textForm == 'prose') {
 | |
| 222 | thisTNode.'@form' = 'prose' | |
| 223 | } else if ( | |
| 224 | textForm == 'vers' || | |
| 225 | ( | |
| 226 | textForm == 'mixte' | |
| 227 | && wNode.parent().parent().name().getLocalPart() == 'ab' | |
| 228 | ) | |
| 229 | 				) {
 | |
| 230 | 					if (! wNode.'@lb') {
 | |
| 231 | thisTNode.'@form' = 'vers' | |
| 232 | 					} else if (wNode.'@lb' == 'start') {
 | |
| 233 | thisTNode.'@form' = 'vers_debut' | |
| 234 | 					} else if (wNode.'@lb' == 'end')  {
 | |
| 235 | thisTNode.'@form' = 'vers_fin' | |
| 236 | } | |
| 237 | } | |
| 238 | // DEBUG TMR 22/08/2012 | |
| 239 | 				else {
 | |
| 240 | thisTNode.'@form' = 'prose' | |
| 241 | } | |
| 242 | // END DEBUG TMR 22/08/2012 | |
| 243 | // Find PoS tag in BFM file | |
| 244 | // Try txm:ana tags | |
| 245 | def txmAnaList = wNode[nsTxm.ana] | |
| 246 | // Then interp tags | |
| 247 | 				if (! txmAnaList) {
 | |
| 248 | txmAnaList = wNode.interp | |
| 249 | } | |
| 250 | // Try type="#pos_syn" | |
| 251 | 				if (txmAnaList.find {it.'@type' == '#pos_syn'}) {
 | |
| 252 | 					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#pos_syn'}.text()
 | |
| 253 | // type attr of wNode | |
| 254 | 				} else if (! txmAnaList && wNode.'@type') {
 | |
| 255 | thisTNode.'@pos' = wNode.'@type' | |
| 256 | // type="#pos" | |
| 257 | 				} else if (txmAnaList.find { it.'@type' =  '#pos' }) {
 | |
| 258 | 					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#pos'}.text()
 | |
| 259 | // type="#fropos" | |
| 260 | 				} else if (txmAnaList.find { it.'@type' =  '#fropos' }) {
 | |
| 261 | 					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#fropos'}.text()
 | |
| 262 | 				} else {
 | |
| 263 | thisTNode.'@pos' = '--' | |
| 264 | } | |
| 265 | } | |
| 266 | // End of injecting closure. | |
| 267 | // Check whether wNode matches tNode | |
| 268 | 			if (thisTNode.'@word' == '#') {
 | |
| 269 | // Not expecting to find these in BFM file, all OK. | |
| 270 | thisTNode.'@pos' = '--' | |
| 271 | thisTNode.'@form' = '--' | |
| 272 | thisTNode.'@q' = '--' | |
| 273 | 			} else if ( idmatch(thisTNode, wNodes[wIx]) ) {
 | |
| 274 | // Perfect match | |
| 275 | wNode = wNodes[wIx] | |
| 276 | q = wNodesInQ.contains(wNode) | |
| 277 | injectPos.call() | |
| 278 | wIx++ | |
| 279 | 			} else if (thisTNode.'@word' == '*') {
 | |
| 280 | // Duplicata, use dupl match list to find correct wNode | |
| 281 | 				def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
 | |
| 282 | 				if (duplIdMatch) {
 | |
| 283 | 					def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
 | |
| 284 | // To enhance performance, I assume initially that the wNode matching the | |
| 285 | // duplicata is within +- 10 words of the current position in the | |
| 286 | // list. The wNodes list is very long, and searches over the entire | |
| 287 | // list are avoided. | |
| 288 | assert matchTNode, "$duplIdMatch" | |
| 289 | 					if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
 | |
| 290 | 						wNode = wNodes[wIx-10 .. wIx+10].find {
 | |
| 291 | idmatch(matchTNode, it) | |
| 292 | } | |
| 293 | } | |
| 294 | // Use whole list if it failed. | |
| 295 | 					if (! wNode) {
 | |
| 296 | 						wNode = wNodes.find {
 | |
| 297 | idmatch(matchTNode, it) | |
| 298 | } | |
| 299 | } | |
| 300 | q = wNodesInQ.contains(wNode) | |
| 301 | injectPos.call() | |
| 302 | // Don't update wIx value! | |
| 303 | 				} else {
 | |
| 304 | thisTNode.'@pos' = '--' | |
| 305 | thisTNode.'@form' = '--' | |
| 306 | thisTNode.'@q' = '--' | |
| 307 | logger.log( | |
| 308 | Level.WARNING, | |
| 309 | "<w/> node matching TS duplicata $thisTNodeId not found." | |
| 310 | ) | |
| 311 | } | |
| 312 | 			}  else if (punctTest(wNodes[wIx])) {
 | |
| 313 | // not expecting to find these in TS file; try same tNode again. | |
| 314 | tNodes.add(0, thisTNode) | |
| 315 | wIx++ | |
| 316 | } else if (! | |
| 317 | 			wNodes.find {
 | |
| 318 | idmatch (thisTNode, it) | |
| 319 | } | |
| 320 | 			) {
 | |
| 321 | // Check the SRCMF corpus hasn't acquired an extra word | |
| 322 | logger.log( | |
| 323 | Level.WARNING, | |
| 324 | 						"TS word $thisTNodeId (${thisTNode.'@word'}) is " +
 | |
| 325 | "absent from BFM file." | |
| 326 | ) | |
| 327 | thisTNode.'@pos' = '--' | |
| 328 | thisTNode.'@form' = '--' | |
| 329 | thisTNode.'@q' = '--' | |
| 330 | 			} else {
 | |
| 331 | // it's doesn't match, it's not a duplicata, | |
| 332 | // it's not BFM punctuation, but it IS found somewhere in the BFM | |
| 333 | // corpus. | |
| 334 | logger.log( | |
| 335 | Level.WARNING, | |
| 336 | "Discontinuity in TS corpus before word $thisTNodeId " + | |
| 337 | 						"(${thisTNode.'@word'})."
 | |
| 338 | ) | |
| 339 | 				wNode = wNodes.find {
 | |
| 340 | idmatch(thisTNode, it) | |
| 341 | } | |
| 342 | q = wNodesInQ.contains(wNode) | |
| 343 | injectPos.call() | |
| 344 | wIx = wNodes.indexOf(wNode) + 1 | |
| 345 | } | |
| 346 | } | |
| 347 | /* | |
| 348 | * End Main iteration | |
| 349 | */ | |
| 350 | /******************* | |
| 351 | * Recalculate headpos | |
| 352 | */ | |
| 353 | logger.log(Level.INFO, "Recalculating headpos.") | |
| 354 | 		TS.s.graph.nonterminals.nt.each { nt ->
 | |
| 355 | 			def a = nt.edge.findAll { it.'@label' == 'L' }
 | |
| 356 | 			if (a.size() == 0) {
 | |
| 357 | nt.'@headpos' = '--' | |
| 358 | 			} else if (a.size() == 1) {
 | |
| 359 | 				nt.'@headpos' = nt.parent().parent().terminals.t.find {
 | |
| 360 | it.'@id' == a[0].'@idref' | |
| 361 | }.'@pos' | |
| 362 | 			} else {
 | |
| 363 | 				for (def headpos : ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET']) {
 | |
| 364 | if ( | |
| 365 | 					a.each { edge ->
 | |
| 366 | 						nt.parent().parent().terminals.t.find {
 | |
| 367 | it.'@id' == edge.'@idref' | |
| 368 | }.'@pos' | |
| 369 | }.contains(headpos) | |
| 370 | 					) {
 | |
| 371 | nt.'@headpos' = headpos + '?' | |
| 372 | break | |
| 373 | } | |
| 374 | } | |
| 375 | } | |
| 376 | assert nt.'@headpos' | |
| 377 | } | |
| 378 | 		logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")
 | |
| 379 |  | |
| 380 | 		outputFile.withWriter { writer ->
 | |
| 381 | writer << groovy.xml.XmlUtil.serialize(TS) | |
| 382 | } | |
| 383 | } | |
| 384 | // Subroutine self-test to establish whether a Groovy node is punctuation | |
| 385 | 	private punctTest(gNode) {
 | |
| 386 | 		if (gNode.'@type') {
 | |
| 387 | 			gNode.'@type'.toLowerCase().startsWith('pon')
 | |
| 388 | 		} else {
 | |
| 389 | 			if (gNode[nsTxm.ana]) {
 | |
| 390 | 				return gNode[nsTxm.ana].find {it.text().toLowerCase().startsWith('pon')} != null
 | |
| 391 | 			} else if (gNode.interp) {
 | |
| 392 | 				return gNode.interp.find {it.text().toLowerCase().startsWith('pon')} != null
 | |
| 393 | } else return false | |
| 394 | } | |
| 395 | } | |
| 396 |  | |
| 397 | 	private idmatch (tNode, wNode) {
 | |
| 398 | // Matches a TS node with a <w/> node | |
| 399 | def idMatch = false | |
| 400 | def wNodeId = wNode.'@id' | |
| 401 | 		if (! wNodeId) {
 | |
| 402 | wNodeId = wNode.attribute(nsXml.id) | |
| 403 | } | |
| 404 | def tNodeId = tNode.'@editionId' | |
| 405 | def regex = /(\d+)([_a-z]+)?$/ | |
| 406 | def m1 = (tNodeId =~ regex) | |
| 407 | def m2 = (wNodeId =~ regex) | |
| 408 | 		if ( m1.find() && m2.find() ) {
 | |
| 409 | idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger()) | |
| 410 | 		} else {
 | |
| 411 | logger.log( | |
| 412 | Level.WARNING, | |
| 413 | "Idmatch routine can't find integer part of one or both of the" + | |
| 414 | "paired Ids for nodes $tNode, $wNode" | |
| 415 | ) | |
| 416 | } | |
| 417 | 		if (! idMatch) {
 | |
| 418 | return false | |
| 419 | } | |
| 420 |  | |
| 421 | //def wNodeForm = wNode.text() // Enable if treating old-style XML-BFM | |
| 422 | def wNodeForm = ""; | |
| 423 | 		if (! wNodeForm) {
 | |
| 424 | wNodeForm = getDeepText(wNode[nsTxm.form][0]) | |
| 425 | } | |
| 426 | def tNodeForm = tNode.'@word' | |
| 427 | def formMatch = (tNodeForm == wNodeForm) | |
| 428 | 		if (idMatch && formMatch) {
 | |
| 429 | return true | |
| 430 | 		} else {
 | |
| 431 | logger.log(Level.WARNING, | |
| 432 | "Matching IDs ($tNodeId) but unmatched forms: TS form '$tNodeForm', BFM form '$wNodeForm'.") | |
| 433 | return true | |
| 434 | } | |
| 435 | } | |
| 436 |  | |
| 437 | 	def getDeepText(def node) {
 | |
| 438 | def str = "" | |
| 439 | 		node.children().each { child ->
 | |
| 440 | 			if (child instanceof Node) {
 | |
| 441 | str += getDeepText(child) | |
| 442 | 			} else if (child instanceof String) {
 | |
| 443 | str += child | |
| 444 | } | |
| 445 | } | |
| 446 | return str | |
| 447 | } | |
| 448 | } | |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/PunctInjectImport.groovy (revision 1000) | ||
|---|---|---|
| 1 | // Script to restore punctuation etc. to results of TS query. | |
| 2 | // Inputs: | |
| 3 | // --- Tiger-XML Document node | |
| 4 | // --- Java array: | |
| 5 | // --- --- String [index][type] where: | |
| 6 | // --- --- --- type == 0 gives the xml:id | |
| 7 | // --- --- --- type == 1 gives the word form | |
| 8 | // Process: | |
| 9 | // --- Injects punctuation. | |
| 10 | // Returns: | |
| 11 | // --- Tiger-XML Document node. | |
| 12 |  | |
| 13 | // TXM package statement | |
| 14 | package org.txm.scripts.importer.srcmf | |
| 15 |  | |
| 16 | import javax.xml.parsers.DocumentBuilderFactory | |
| 17 |  | |
| 18 |  | |
| 19 |  | |
/**
 * Restores punctuation (and any other word missing from the syntactic
 * annotation) to a Tiger-XML document.
 *
 * Inputs are a Tiger-XML file and an ordered word table whose entries are
 * two-element lists: index 0 is the word id, index 1 is the word form. The
 * table is either passed in directly or read from the tei:w elements of an
 * XML-TXM file. {@link #process()} aligns the table with the 't' elements of
 * the Tiger-XML document, inserts a new 't' element for every word that is
 * missing, moves sentence-initial closing punctuation back into the previous
 * 'terminals' element and writes the result to {@code outputFile}.
 *
 * @author tmr
 */
class PunctInjectImport
{

	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
	static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
	// Root element (W3C DOM) of the parsed Tiger-XML document.
	def tigerXml = null
	// Ordered list of [id, form] pairs; never modified by process().
	def txmIdWordTableFixed = []
	// Destination of process(); defaults to "output.xml" in the working directory.
	def outputFile = null

	/*
	 * Call this constructor when no need of XML-TXM file for word ID table
	 */
	PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
		tigerXml = builder.parse(tigerXmlFile).documentElement
		txmIdWordTableFixed = txmIdWordTable
		outputFile = new File("output.xml")
	}

	/*
	 * Call this constructor when the XML-TXM file is needed for the word list.
	 * Every tei:w element found under the text element contributes one
	 * [id, form] entry, in document order.
	 */
	PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
		tigerXml = builder.parse(tigerXmlFile).documentElement
		txmIdWordTableFixed = []
		def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
		for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
			// The id may be a plain 'id' attribute or a namespaced xml:id.
			def anId = w.'@id'
			if (! anId) {
				anId = w.attribute(nsXml.id)
			}
			if (! anId) {
				println "Error: ID attribute not located in XML TXM file"
			}
			txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
		}
		// assert txmIdWordTableFixed.size() > 0
		outputFile = new File("output.xml")
	}

	/**
	 * Aligns the word table with the 't' elements of the Tiger-XML document,
	 * injects the missing words, redistributes sentence-initial punctuation
	 * and writes the resulting document to {@code outputFile} in UTF-8.
	 */
	def process () {
		// Word forms containing one of these characters are treated as punctuation.
		def punctPattern = /[\,\.\?\!\:\;\(\)\[\]\{\}]/
		def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
		// Snapshot taken before any insertion: nodes added below are NOT in allTs.
		def allTs = toList(this.tigerXml.getElementsByTagName('t'))
		def tAttrs = getTAttrNames(allTs[0])
		def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
		def addedWordIds = []
		int tIx = 0
		while (txmIdWordTable) {
			def word = txmIdWordTable.remove(0)
			if (tIx == allTs.size()) {
				// End of TS file, but still words left in the BFM file.
				addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
			}
			else {
				def tNode = allTs[tIx]
				def tId = getTNodeId(tNode)
				if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
					// Ids and forms agree: nothing to do.
					tIx += 1
				}
				else if (tId == word[0]) {
					println("Mismatched Ids! ($tId)")
					tIx += 1
				}
				else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
					// SRCMF duplicata; try comparing word against the next tNode next time
					// around.
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that the SRCMF corpus doesn't have a bug in it...
				// The tIx > 0 guard matters: the range 0..-1 would select the WHOLE
				// list (current node included) instead of "no preceding nodes".
				else if ( !(word[1] =~ punctPattern)
				&& tIx > 0
				&& (allTs[0..<tIx].find{
					it.getAttribute('id') == "$tNodeIdPrefix#$tId"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
				else if ( !(word[1] =~ punctPattern)
				&& (allTs[tIx..-1].find{
					it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else if (addedWordIds.contains(tId)) {
					println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else {
					// Insert word. In the first instance, it will have the same parent as
					// the tNode before which it's being inserted.
					addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
					addedWordIds.add(word[0])
				}
			}
		}
		// Second phase: move punctuation into previous sentence,
		// dependent on sequence.
		def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
		for (def i = 1 ; i < allTerminalses.size() ; i++) {
			def ts = toList(allTerminalses[i].getElementsByTagName('t'))
			def startPunc = true
			def puncStack = []
			// Collect the run of tokens without any letter or digit that opens the sentence.
			// A-Za-z rather than A-z: the latter also matches [ \ ] ^ _ and backtick,
			// which kept leading brackets out of the punctuation stack.
			while (ts && startPunc) {
				if ((ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
					puncStack.add(ts.remove(0))
				}
				else {
					startPunc = false
				}
			}
			// Now, treat the punctuation stack at the beginning of the sentence
			if ( puncStack ) {
				int moveLeft = 0
				// One character per stacked token, so string indices equal stack
				// indices. An empty word form yields a neutral blank instead of
				// raising an index error.
				def puncString = puncStack.collect{
					def form = it.getAttribute('word')
					form ? form[0] : ' '
				}.join('')
				// First, identify LAST instance of sentence-final punctuation.
				def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
				if (matches.size() > 0) {
					moveLeft = puncString.lastIndexOf(matches[-1]) + 1
				}
				// Second, split pairs of straight quotes
				matches = puncString =~ /(""|'')/ //"
				if (matches.size() > 0) {
					moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
				}
				// Now, move moveLeft punctuation nodes to the end of the prev. sentence
				ts = toList(allTerminalses[i].getElementsByTagName('t'))
				for (def j = 0 ; j < moveLeft ; j++ ) {
					allTerminalses[i - 1].appendChild(ts[j])
				}
			}
		}
		outputFile.withWriter("UTF-8") { writer ->
			writer.println(this.tigerXml)
		}
	}

	/**
	 * Creates a new 't' element for {@code word} and attaches it next to {@code tNode}.
	 *
	 * @param word [id, form] pair
	 * @param tNode existing 't' element used as anchor
	 * @param tAttrs attribute names to create; all but 'id' and 'word' are set to '--'
	 * @param tNodeIdPrefix prefix placed before '#' in the new id
	 * @param where 'before' inserts ahead of tNode, 'append' adds at the end of tNode's parent
	 * @throws IllegalArgumentException for any other value of where
	 */
	private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
		def newTNode = tNode.getOwnerDocument().createElement('t')
		for (def anAttr : tAttrs) {
			if (anAttr == 'id') {
				newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
			}
			else if (anAttr == 'word') {
				newTNode.setAttribute('word', word[1])
			}
			else {
				newTNode.setAttribute(anAttr, '--')
			}
		}
		if (where == 'before') {
			tNode.getParentNode().insertBefore(newTNode, tNode)
		}
		else if (where == 'append') {
			tNode.getParentNode().appendChild(newTNode)
		}
		else {
			throw new IllegalArgumentException("Bad 'where' value: ${where}")
		}
	}

	/** Returns the attribute names of a DOM element as a list. */
	def getTAttrNames(tNode) {
		def nodeMap = tNode.attributes
		def nameList = []
		for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
			nameList.add( nodeMap.item(i).nodeName )
		}
		return nameList
	}

	/** Returns the part of the element's 'id' attribute that follows the last '#'. */
	def getTNodeId(tNode) {
		return tNode.getAttribute('id').tokenize('#').last()
	}

	/** Copies anything iterable (e.g. a DOM NodeList) into a plain list. */
	def toList(def iterable) {
		return iterable.findAll {true};
	}

	/** Concatenates, depth first, all string content below a groovy.util.Node. */
	def getDeepText(def node) {
		def str = ""
		node.children().each { child ->
			if (child instanceof Node) {
				str += getDeepText(child)
			} else if (child instanceof String) {
				str += child
			}
		}
		return str
	}

	/**
	 * Command-line entry point: PunctInjectImport.groovy [-o out.xml] TigerXml.xml XmlTxm.xml
	 */
	public static void main(String[] args)
	{
		def cli = new CliBuilder(
				usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
				)
		cli.h(longOpt:'help', 'Prints this message.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
		def options = cli.parse(args)
		if (!options) {
			// parse() has already reported the problem and returns null in that case.
			System.exit(2)
		}
		if (options.h) {
			cli.usage()
			return
		}
		def tigerXmlFile = null
		def xmlTxmFile = null
		if (options.arguments().size() == 2) {
			tigerXmlFile = new File(options.arguments()[0])
			xmlTxmFile = new File(options.arguments()[1])
		} else {
			println 'Incorrect number of command line arguments... exiting'
			// usage() prints by itself and returns nothing; wrapping it in println printed "null".
			cli.usage()
			System.exit(2)
		}
		def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
		if (options.o) {
			pii.outputFile = new File(options.o)
		}
		pii.process()
	}
}
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/srcmf/srcmfLoader.groovy (revision 1000) | ||
|---|---|---|
| 1 | package org.txm.scripts.importer.srcmf | |
| 2 |  | |
| 3 | // srcdir | |
| 4 | // txml/master.xml | |
| 5 | // txml/*.xml | |
| 6 | // txm/*.xml | |
| 7 |  | |
| 8 | // 1 import xml-txm | |
| 9 | import ims.tiger.gui.tigerregistry.TIGERRegistry; | |
| 10 | import ims.tiger.index.writer.IndexBuilderErrorHandler; | |
| 11 | import ims.tiger.index.writer.SimpleErrorHandler; | |
| 12 | import ims.tiger.index.writer.XMLIndexing; | |
| 13 | import ims.tiger.system.*; | |
| 14 |  | |
| 15 | import javax.xml.stream.*; | |
| 16 |  | |
| 17 | import java.io.BufferedWriter; | |
| 18 | import java.io.File; | |
| 19 | import java.io.FileOutputStream; | |
| 20 | import java.io.IOException; | |
| 21 | import java.io.OutputStreamWriter; | |
| 22 | import java.io.PrintStream; | |
| 23 | import java.net.URL; | |
| 24 | import java.util.HashMap; | |
| 25 | import java.util.logging.FileHandler | |
| 26 | import java.io.File; | |
| 27 |  | |
| 28 | import org.apache.log4j.BasicConfigurator; | |
| 29 | import org.txm.export.ts.*; | |
| 30 | import org.txm.importer.ApplyXsl2; | |
| 31 | import org.txm.utils.xml.DomUtils; | |
| 32 | import org.txm.importer.ValidateXml; | |
| 33 | import org.txm.importer.XPathResult; | |
| 34 | import org.txm.importer.xmltxm.compiler; | |
| 35 | import org.txm.importer.xml.pager; | |
| 36 | import org.txm.objects.*; | |
| 37 | import org.txm.utils.* | |
| 38 | import org.txm.utils.io.*; | |
| 39 | import org.txm.*; | |
| 40 | import org.txm.importer.xmltxm.*; | |
| 41 | import org.txm.metadatas.*; | |
| 42 | import org.txm.utils.i18n.*; | |
| 43 | import org.w3c.dom.Element | |
| 44 | import org.xml.sax.SAXException; | |
| 45 |  | |
| 46 | import javax.xml.parsers.DocumentBuilderFactory | |
| 47 |  | |
// SRCMF loader script.
// Expected source layout (rootDir): xml-txm/*.xml, tiger-xml/master.xml,
// tiger-xml/header.xml. Steps: copy and validate the XML-TXM files, compile
// the search engine indexes, build the HTML edition, save import.xml, then
// build the Tiger-XML file, inject punctuation and index it for TIGERSearch.

String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
// paramsBinding and monitor are looked up in the script binding; if either
// lookup throws, the script falls back to a hard-coded development setup.
try {
	params = paramsBinding; MONITOR = monitor
} catch (Exception ignored) {
	println "DEV MODE";//exception means we debug
	debug = true
	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
	}
}
if (params == null) { println "no parameters. Aborting"; return; }

String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// Start from an empty binary corpus directory.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
	if (f.getName().equals("import.xml") || f.getName().matches("metadata\\.....?") || f.getName().endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, f.getName()));
	} else {
		println "Won't process file "+f;
	}
}
// listFiles() returns an empty array (not null) for an existing empty
// directory, so emptiness has to be tested explicitly.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
	println "No txm file to process"
	return;
}

List<File> filelist = txmDir.listFiles();
Collections.sort(filelist);

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"
def c = new compiler();
if (debug) c.setDebug();
c.setLang(lang);
//c.setSortMetadata(sortMetadata)
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
	println "import process stopped";
	return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();


// Number of texts already announced on the console.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
	String txtname = srcfile.getName();
	int i = txtname.lastIndexOf(".");
	if(i > 0) txtname = txtname.substring(0, i);
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
	// Separate file names with ", " and wrap the line after every fifth name.
	// The counter is incremented unconditionally; previously it stayed at 0
	// forever, so neither separator nor line break was ever printed.
	if (second > 0) {
		print(", ")
		if ((second % 5) == 0) println ""
	}
	second++
	print(srcfile.getName());
	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
	// i is reused as page index; pages are numbered from 1.
	for (i = 0 ; i < ed.getPageFiles().size();) {
		File f = ed.getPageFiles().get(i);
		String wordid = ed.getIdx().get(i);
		params.addPage(edition, ""+(++i), wordid);
	}
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;


// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");
// NOTE(review): takes whatever file is listed first, which is not necessarily
// an XML-TXM text if the directory also holds import.xml or metadata files.
File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
PunctInjectImport injector;
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return;
} else {
	injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
}
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()

// Create the tigersearch.logprop file (used to launch TIGERSearch)
File logprop = new File(tigerDir, "tigersearch.logprop");
logprop.withWriter("UTF-8") { writer ->
	writer.write("""# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")
}

// Run TigerRegistry
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
	IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
	XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
	indexing.startIndexing();
}
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }

// The intermediate Tiger-XML file is no longer needed once indexed.
tigerXmlAll.delete()
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TigerXMLHeader.xsd (revision 1000) | ||
|---|---|---|
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">

  <!-- =======================================================================
       XML SubSchema for the header part of the TIGER-XML format
       http://www.ims.uni-stuttgart.de/projekte/TIGER/publicTigerXMLHeader.xsd
       =======================================================================
       TIGER Project, Wolfgang Lezius
       IMS, University of Stuttgart, 04/01/2003
       ======================================================================= -->

  <!-- Maintenance note: all built-in types use the "xs" prefix, the only
       prefix bound to the XML Schema namespace in this document. -->


  <!-- ======================================================
       DECLARATION OF THE HEADER
       ====================================================== -->


  <!-- declaration of the head element -->

  <xs:element name="head" type="headType"/>


  <!-- declaration of the header type -->

  <xs:complexType name="headType">

    <xs:sequence>
      <xs:element name="meta" type="metaType" minOccurs="0" maxOccurs="1"/>
      <xs:element name="annotation" type="annotationType" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>

    <!-- optional: reference to external header file

         The header of a TigerXML corpus can also be stored in separate file.
         This attribute points to the external header file. The pointer is
         an URI. Examples: file:relative.xml or file:/path/to/absolute.xml

         Note: If there is a pointer to an external file, the head
         element must be empty. -->

    <xs:attribute name="external" type="xs:anyURI"/>

  </xs:complexType>


  <!-- declaration of the meta information type -->

  <xs:complexType name="metaType">

    <xs:sequence>
      <xs:element name="name" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="author" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="date" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="description" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="format" type="xs:string" minOccurs="0" maxOccurs="1"/>
      <xs:element name="history" type="xs:string" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the annotation type -->

  <xs:complexType name="annotationType">

    <xs:sequence>
      <xs:element name="feature" type="featureType" minOccurs="1" maxOccurs="unbounded"/>
      <xs:element name="edgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/>
      <xs:element name="secedgelabel" type="edgelabelType" minOccurs="0" maxOccurs="1"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the feature type -->

  <xs:complexType name="featureType">

    <xs:sequence>
      <xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>

    <xs:attribute name="name" type="featurenameType" use="required"/>

    <xs:attribute name="domain" use="required">
      <xs:simpleType>
        <xs:restriction base="xs:string">
          <xs:enumeration value="T"/>    <!-- feature for terminal nodes -->
          <xs:enumeration value="NT"/>   <!-- feature for nonterminal nodes -->
          <xs:enumeration value="FREC"/> <!-- feature for both -->
        </xs:restriction>
      </xs:simpleType>
    </xs:attribute>

  </xs:complexType>


  <!-- declaration of the (secondary) edge label type -->

  <xs:complexType name="edgelabelType">

    <xs:sequence>
      <xs:element name="value" type="featurevalueType" minOccurs="0" maxOccurs="unbounded"/>
    </xs:sequence>

  </xs:complexType>


  <!-- declaration of the feature value type -->

  <xs:complexType name="featurevalueType">

    <xs:simpleContent> <!-- element content: documentation of the feature value -->
      <xs:extension base="xs:string">
        <xs:attribute name="name" type="xs:string"/>
      </xs:extension>
    </xs:simpleContent>


  </xs:complexType>


  <!-- ======================================================
       HEADER DECLARATIONS THAT SHOULD BE REFINED
       ====================================================== -->

  <!-- declaration of the FEATURE NAMES used in the corpus header;
       this type is unrestricted, but should be refined by a
       specialised, corpus-dependent schema -->

  <xs:simpleType name="featurenameType">

    <xs:restriction base="xs:string">
      <xs:minLength value="1"/>
      <xs:maxLength value="20"/>
      <xs:whiteSpace value="preserve"/>
    </xs:restriction>

  </xs:simpleType>


</xs:schema>
| 0 | 143 | |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/scripts/importer/tigersearch/TSImport.groovy (revision 1000) | ||
|---|---|---|
| 1 | package org.txm.scripts.importer.tigersearch; | |
| 2 |  | |
| 3 | import java.io.File; | |
| 4 | import java.util.ArrayList; | |
| 5 |  | |
| 6 | import ims.tiger.index.writer.* | |
| 7 | import ims.tiger.system.* | |
| 8 |  | |
| 9 | import org.txm.Toolbox; | |
| 10 | import org.txm.importer.ApplyXsl2; | |
| 11 | import org.txm.importer.xtz.* | |
| 12 | import org.txm.objects.BaseParameters | |
| 13 | import org.txm.utils.BundleUtils; | |
| 14 | import org.txm.utils.io.FileCopy; | |
| 15 | import org.apache.log4j.BasicConfigurator; | |
| 16 | import org.txm.importer.xtz.* | |
| 17 | import org.txm.scripts.importer.xtz.* | |
| 18 |  | |
| 19 | class TSImport extends XTZImport {
 | |
| 20 |  | |
| 21 | 	public TSImport(BaseParameters params) {
 | |
| 22 | super(params); | |
| 23 | } | |
| 24 |  | |
| 25 | @Override | |
| 26 | 	public void init(BaseParameters p) {
 | |
| 27 | super.init(p); | |
| 28 |  | |
| 29 | importer = new TSImporter(this); // only to build metadata | |
| 30 | compiler = new XTZCompiler(this) | |
| 31 | annotater = null; // no annotater step to do | |
| 32 | pager = new XTZPager(this) | |
| 33 | } | |
| 34 |  | |
| 35 | /** | |
| 36 | * Do a XTZ Import then build the TIGERSearch indexes in the binary corpus "tiger" directory | |
| 37 | */ | |
| 38 | @Override | |
| 39 | 	public void start() throws InterruptedException {
 | |
| 40 | File tigerSrcDir = sourceDirectory | |
| 41 |  | |
| 42 | def xmlFiles = [] | |
| 43 | 		sourceDirectory.listFiles(new FileFilter() {
 | |
| 44 | 					boolean accept(File file) {
 | |
| 45 | if (file.isDirectory()) return false; | |
| 46 | if (file.isHidden()) return false; | |
| 47 | 						if (file.getName().equals("import.xml")) return false;
 | |
| 48 | 						if (!file.getName().endsWith(".xml")) return false;
 | |
| 49 |  | |
| 50 | xmlFiles << file | |
| 51 | } | |
| 52 | }); | |
| 53 |  | |
| 54 | xmlFiles.remove(new File(sourceDirectory, "import.xml")) | |
| 55 |  | |
| 56 | 		if (xmlFiles.size() == 0) {
 | |
| 57 | println "Error no XML file found in $sourceDirectory" | |
| 58 | isSuccessful = false; | |
| 59 | return; | |
| 60 | } | |
| 61 |  | |
| 62 | File master = xmlFiles[0]; | |
| 63 | println "Main TIGER XML file found: $master" | |
| 64 |  | |
| 65 | File tsXSLFile = new File(Toolbox.getTXMHOMEPATH(), "xsl/ts.xsl"); | |
| 66 | 		BundleUtils.copyFiles("TIGERSearchRCP", "src", "org/txm/importer/tigersearch", "ts.xsl", tsXSLFile.getParentFile());
 | |
| 67 |  | |
| 68 | File xmltxmSrcDir = new File(binaryDirectory, "src"); // output directory of the TS XSL transformation | |
| 69 | xmltxmSrcDir.mkdirs(); | |
| 70 | FileCopy.copy(master, new File(xmltxmSrcDir, master.getName())); | |
| 71 |  | |
| 72 | 		if (!ApplyXsl2.processImportSources(tsXSLFile, xmltxmSrcDir, xmltxmSrcDir)) {
 | |
| 73 | println "Error while applying TS XSL file to $tigerSrcDir" | |
| 74 | isSuccessful = false; | |
| 75 | return; | |
| 76 | } | |
| 77 |  | |
| 78 | File[] files = xmltxmSrcDir.listFiles(); | |
| 79 | 		if (files == null || files.length == 0) {
 | |
| 80 | println "Error while applying TS XSL file to $xmltxmSrcDir is empty" | |
| 81 | isSuccessful = false; | |
| 82 | return; | |
| 83 | } | |
| 84 |  | |
| 85 | sourceDirectory = xmltxmSrcDir; // hop | |
| 86 | File txmDir = new File(binaryDirectory, "txm/"+corpusName); | |
| 87 | txmDir.mkdirs(); | |
| 88 | FileCopy.copyFiles(sourceDirectory, txmDir) // the compiler step will use these files | |
| 89 |  | |
| 90 | super.start(); // call the usual XTZ import | |
| 91 |  | |
| 92 | 		if (isSuccessful) {
 | |
| 93 |  | |
| 94 | File tigerDir = new File(binaryDirectory, "tiger"); | |
| 95 | tigerDir.mkdir(); | |
| 96 |  | |
| 97 |  | |
| 98 | File logprop = new File(tigerDir, "tigersearch.logprop"); | |
| 99 |  | |
| 100 | 			logprop.withWriter("UTF-8") { writer ->
 | |
| 101 | 				writer.write("""# Default log configuration of the TIGERSearch suite
 | |
| 102 | log4j.rootLogger=WARN,Logfile | |
| 103 | log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=WARNING | |
| 104 | log4j.appender.Logfile=org.apache.log4j.RollingFileAppender | |
Formats disponibles : Unified diff