Révision 911

tmp/org.txm.para.rcp/.classpath (revision 911)
1 1
<?xml version="1.0" encoding="UTF-8"?>
2 2
<classpath>
3 3
	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
4
	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
4
	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins">
5
		<accessrules>
6
			<accessrule kind="accessible" pattern="**"/>
7
		</accessrules>
8
	</classpathentry>
5 9
	<classpathentry kind="src" path="src"/>
6 10
	<classpathentry kind="output" path="bin"/>
7 11
</classpath>
tmp/org.txm.para.rcp/META-INF/MANIFEST.MF (revision 911)
1 1
Manifest-Version: 1.0
2
Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp
3
 ort,org.eclipse.core.runtime;bundle-version="3.10.0";visibility:=reex
4
 port,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,or
5
 g.eclipse.ui;visibility:=reexport,org.txm.concordance.rcp;visibility:
6
 =reexport,org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibi
7
 lity:=reexport,org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor
8
 t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.txm.pa
9
 ra.core;visibility:=reexport,org.txm.searchengine.core;bundle-version
10
 ="1.0.0";visibility:=reexport
2
Require-Bundle: org.txm.concordance.rcp;visibility:=reexport,
3
 org.txm.para.core;visibility:=reexport
11 4
Export-Package: org.txm.para.rcp.editors,
12 5
 org.txm.para.rcp.handlers,
13 6
 org.txm.para.rcp.messages
tmp/org.txm.textsbalance.feature/feature.xml (revision 911)
65 65
   </license>
66 66

  
67 67
   <requires>
68
      <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
69
      <import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/>
70
      <import plugin="org.txm.rcp" version="0.7.7" match="greaterOrEqual"/>
71
      <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
72
      <import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/>
73
      <import plugin="org.eclipse.jface.text"/>
74
      <import plugin="org.eclipse.ui.editors"/>
75
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
76
      <import plugin="org.txm.chartsengine.r.core"/>
77
      <import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/>
78 68
      <import plugin="org.txm.textsbalance.core" version="1.0.0" match="greaterOrEqual"/>
79
      <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
80 69
      <import plugin="org.txm.chartsengine.rcp" version="1.0.0" match="greaterOrEqual"/>
70
      <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
71
      <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
72
      <import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
81 73
   </requires>
82 74

  
83 75
   <plugin
tmp/org.txm.para.core/.classpath (revision 911)
1 1
<?xml version="1.0" encoding="UTF-8"?>
2 2
<classpath>
3 3
	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
4
	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
4
	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins">
5
		<accessrules>
6
			<accessrule kind="accessible" pattern="**"/>
7
		</accessrules>
8
	</classpathentry>
5 9
	<classpathentry kind="src" path="src"/>
6 10
	<classpathentry kind="output" path="bin"/>
7 11
</classpath>
tmp/org.txm.para.core/META-INF/MANIFEST.MF (revision 911)
1 1
Manifest-Version: 1.0
2
Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.3";visibil
3
 ity:=reexport,org.txm.utils;bundle-version="1.0.0";visibility:=reexpo
4
 rt,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org.
5
 eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,org
6
 .txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpor
7
 t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.eclips
8
 e.ui;visibility:=reexport,org.txm.searchengine.core;bundle-version="1
9
 .0.0";visibility:=reexport
2
Require-Bundle: org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexport
10 3
Export-Package: org.txm.para.core.functions
11 4
Bundle-ActivationPolicy: lazy
12 5
Bundle-Version: 1.0.0.qualifier
tmp/org.txm.treetagger.files.feature/feature.xml (revision 911)
69 69
   </license>
70 70

  
71 71
   <requires>
72
      <import plugin="org.txm.utils"/>
73
      <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
74 72
      <import plugin="org.txm.core" version="0.8.0" match="greaterOrEqual"/>
75 73
      <import plugin="org.txm.treetagger.core" version="1.0.0" match="greaterOrEqual"/>
76
      <import plugin="org.eclipse.ui"/>
77
      <import plugin="org.eclipse.swt"/>
78
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
79 74
      <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
80
      <import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/>
81
      <import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/>
82 75
   </requires>
83 76

  
84 77
   <plugin
tmp/org.txm.analec.rcp/src/org/txm/analec/imports/AnalecAnnotationsImporter.java (revision 911)
18 18
import org.apache.commons.lang.StringUtils;
19 19
import org.eclipse.core.runtime.IProgressMonitor;
20 20
import org.txm.Toolbox;
21
import org.txm.importer.graal.PersonalNamespaceContext;
21
import org.txm.importer.PersonalNamespaceContext;
22 22
import org.txm.searchengine.cqp.AbstractCqiClient;
23 23
import org.txm.searchengine.cqp.CQPSearchEngine;
24 24
import org.txm.searchengine.cqp.corpus.MainCorpus;
tmp/org.txm.analec.rcp/src/org/txm/analec/imports/DOMAnalecAnnotationsImporter.java (revision 911)
8 8

  
9 9
import org.eclipse.core.runtime.IProgressMonitor;
10 10
import org.txm.Toolbox;
11
import org.txm.importer.graal.PersonalNamespaceContext;
11
import org.txm.importer.PersonalNamespaceContext;
12 12
import org.txm.searchengine.cqp.AbstractCqiClient;
13 13
import org.txm.searchengine.cqp.CQPSearchEngine;
14 14
import org.txm.searchengine.cqp.corpus.MainCorpus;
tmp/org.txm.analec.rcp/src/org/txm/analec/export/AnalecAnnotationTEIExporter.java (revision 911)
17 17
import org.txm.Toolbox;
18 18
import org.txm.importer.StaxIdentityParser;
19 19
import org.txm.importer.StaxStackWriter;
20
import org.txm.importer.graal.PersonalNamespaceContext;
20
import org.txm.importer.PersonalNamespaceContext;
21 21
import org.txm.objects.BaseParameters;
22 22
import org.txm.rcp.Application;
23 23
import org.txm.rcp.TxmPreferences;
tmp/org.txm.wordcloud.feature/feature.xml (revision 911)
67 67
   </license>
68 68

  
69 69
   <requires>
70
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
71

  
72
      <import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/>
73
      <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
74
      <import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/>
75
      <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
76
      <import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
77 70
      <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
78
      <import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/>
79 71
      <import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
80 72
      <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
81
      <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
82
      <import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/>
83 73
      <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
84
      <import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/>
85 74
      <import plugin="org.txm.wordcloud.core" version="1.0.0" match="greaterOrEqual"/>
86 75
      <import plugin="org.txm.chartsengine.rcp"/>
87 76
   </requires>
tmp/org.txm.dictionary.feature/feature.xml (revision 911)
17 17
   </license>
18 18

  
19 19
   <requires>
20
      <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
21
      <import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/>
22
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
23
      <import plugin="org.eclipse.ui"/>
24
      <import plugin="org.eclipse.core.runtime"/>
25
      <import plugin="org.txm.utils"/>
26
      <import plugin="org.eclipse.persistence.jpa" version="2.6.0" match="greaterOrEqual"/>
27
      <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
28 20
      <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
29
      <import plugin="javax.persistence"/>
30 21
      <import plugin="org.txm.annotation.kr.core" version="1.0.0" match="greaterOrEqual"/>
31 22
   </requires>
32 23

  
tmp/org.txm.wordcloud.rcp/META-INF/MANIFEST.MF (revision 911)
1 1
Manifest-Version: 1.0
2
Require-Bundle: org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor
3
 t,org.txm.utils;bundle-version="1.0.0";visibility:=reexport,org.eclip
4
 se.osgi;bundle-version="3.10.2";visibility:=reexport,org.txm.index.rc
5
 p;bundle-version="1.0.0";visibility:=reexport,org.eclipse.core.runtim
6
 e;bundle-version="3.10.0";visibility:=reexport,org.txm.chartsengine.c
7
 ore;bundle-version="1.0.0";visibility:=reexport,org.txm.index.core;bu
8
 ndle-version="1.0.0";visibility:=reexport,org.txm.searchengine.cqp.co
9
 re;bundle-version="1.1.0";visibility:=reexport,org.eclipse.ui;bundle-
10
 version="3.106.1";visibility:=reexport,org.txm.wordcloud.core;bundle-
11
 version="1.0.0";visibility:=reexport,org.txm.chartsengine.rcp;visibil
12
 ity:=reexport,org.txm.core;bundle-version="0.7.0";visibility:=reexpor
13
 t
2
Require-Bundle: org.txm.index.rcp;bundle-version="1.0.0";visibility:=reexport,
3
 org.txm.wordcloud.core;bundle-version="1.0.0";visibility:=reexport,
4
 org.txm.chartsengine.rcp;visibility:=reexport
14 5
Bundle-Vendor: Textometrie.org
15 6
Bundle-ActivationPolicy: lazy
16 7
Bundle-Version: 1.0.0.qualifier
tmp/org.txm.tigersearch.feature/feature.xml (revision 911)
17 17
   </license>
18 18

  
19 19
   <requires>
20
      <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
21
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
22
      <import plugin="org.eclipse.ui"/>
23
      <import plugin="org.eclipse.core.runtime"/>
24
      <import plugin="org.eclipse.ui.editors" version="3.8.200" match="greaterOrEqual"/>
25
      <import plugin="org.eclipse.swt"/>
26
      <import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/>
27
      <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
28
      <import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/>
29
      <import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
30
      <import plugin="org.txm.statsengine.r.rcp" version="1.0.0" match="greaterOrEqual"/>
31
      <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
32 20
      <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
33
      <import plugin="org.txm.utils"/>
34
      <import plugin="org.txm.groovy.core" version="1.0.0" match="greaterOrEqual"/>
35 21
   </requires>
36 22

  
37 23
   <plugin
tmp/org.txm.cql2lsa.feature/feature.xml (revision 911)
17 17
   </license>
18 18

  
19 19
   <requires>
20
      <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
21 20
      <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
22
      <import plugin="org.eclipse.ui"/>
23
      <import plugin="org.eclipse.core.runtime"/>
24
      <import plugin="org.eclipse.ui.editors" version="3.8.100" match="greaterOrEqual"/>
25
      <import plugin="org.eclipse.core.expressions" version="3.4.500" match="greaterOrEqual"/>
26
      <import plugin="org.txm.index.core"/>
27
      <import plugin="org.txm.lexicaltable.core"/>
28
      <import plugin="org.txm.statsengine.r.core"/>
29 21
   </requires>
30 22

  
31 23
   <plugin
tmp/org.txm.tigersearch.rcp/.settings/org.eclipse.jdt.groovy.core.prefs (revision 911)
1 1
eclipse.preferences.version=1
2
groovy.compiler.level=23
2
groovy.compiler.level=-1
3
groovy.script.filters=**/*.dsld,y,**/*.gradle,n
tmp/org.txm.tigersearch.rcp/META-INF/MANIFEST.MF (revision 911)
131 131
 ls.debugger,org.mozilla.javascript.tools.idswitch,org.mozilla.javascr
132 132
 ipt.tools.jsc,org.mozilla.javascript.tools.shell,org.relaxng.datatype
133 133
 ,org.relaxng.datatype.helpers,org.txm.export.ts,org.txm.function.tige
134
 rsearch,org.txm.importer.srcmf,org.txm.importer.tigersearch,org.txm.s
134
 rsearch,org.txm.s
135 135
 earchengine.ts,org.txm.test,org.txm.tigersearch.commands,org.txm.tige
136 136
 rsearch.editors,org.w3c.css.sac,org.w3c.css.sac.helpers,org.w3c.dom.s
137 137
 mil,org.w3c.dom.svg,tigerAPI,tigerAPI.converters,tigerAPI.theories.ho
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/PunctInjectImport.groovy (revision 911)
1
// Script to restore punctuation etc. to results of TS query.
2
// Inputs:
3
// --- Tiger-XML Document node
4
// --- Java array:
5
// --- --- String [index][type] where:
6
// --- --- --- type == 0 gives the xml:id
7
// --- --- --- type == 1 gives the word form
8
// Process:
9
// --- Injects punctuation.
10
// Returns:
11
// --- Tiger-XML Document node.
12

  
13
// TXM package statement
14
package org.txm.importer.srcmf
15

  
16
import javax.xml.parsers.DocumentBuilderFactory
17

  
18

  
19

  
20
/**
21
 * 
22
 * @author tmr
23
 *
24
 */
25
class PunctInjectImport
{

	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
	static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
	/** Root element of the parsed Tiger-XML document; process() modifies it in place. */
	def tigerXml = null
	/** Reference word table: a list of [id, form] pairs compared, in order, against the <t> nodes. */
	def txmIdWordTableFixed = []
	/** File written by process(); defaults to "output.xml" in the working directory. */
	def outputFile = null

	/**
	 * Call this constructor when no need of XML-TXM file for word ID table.
	 *
	 * @param tigerXmlFile   the Tiger-XML file to parse
	 * @param txmIdWordTable list of [id, form] pairs, used as the reference word list
	 */
	PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
		tigerXml = builder.parse(tigerXmlFile).documentElement
		txmIdWordTableFixed = txmIdWordTable
		outputFile = new File("output.xml")
	}

	/**
	 * Call this constructor when the XML-TXM file is needed for the word list.
	 * Every tei:w element found under the "text" element contributes one
	 * [id, form] pair, the form being the text content of its first txm:form child.
	 *
	 * @param tigerXmlFile the Tiger-XML file to parse
	 * @param xmlTxmFile   the XML-TXM file the word list is read from
	 */
	PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
		tigerXml = builder.parse(tigerXmlFile).documentElement
		txmIdWordTableFixed = []
		def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
		for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
			// Try the plain "id" attribute first, then fall back to xml:id.
			def anId = w.'@id'
			if (! anId) {
				anId = w.attribute(nsXml.id)
			}
			if (! anId) {
				println "Error: ID attribute not located in XML TXM file"
			}
			txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
		}
		// assert txmIdWordTableFixed.size() > 0
		outputFile = new File("output.xml")
	}

	/**
	 * Injects the words missing from the Tiger-XML document (first phase), moves
	 * sentence-initial punctuation to the end of the previous sentence (second
	 * phase), then writes the document to outputFile as UTF-8.
	 *
	 * @throws IllegalStateException if the Tiger-XML document contains no <t> element
	 */
	def process () {
		injectMissingWords()
		moveLeadingPunctuation()
		outputFile.withWriter("UTF-8") { writer ->
			writer.println(this.tigerXml)
		}
	}

	/**
	 * First phase: walks the reference word table and the <t> nodes in parallel and
	 * creates a <t> node for every reference word that has no counterpart.
	 */
	private injectMissingWords() {
		// Work on a copy: entries are consumed (and sometimes pushed back) below.
		def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
		// Snapshot of the <t> nodes; nodes created below are NOT part of it.
		def allTs = toList(this.tigerXml.getElementsByTagName('t'))
		if (!allTs) {
			throw new IllegalStateException('No <t> element found in the Tiger-XML document')
		}
		def tAttrs = getTAttrNames(allTs[0])
		def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
		def addedWordIds = []
		// True when the form contains none of the listed punctuation characters.
		def hasNoPunctuation = { form -> !(form =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/) }
		int tIx = 0
		while (txmIdWordTable) {
			def word = txmIdWordTable.remove(0)
			if (tIx == allTs.size()) {
				// End of TS file, but still words left in the BFM file.
				addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
			}
			else {
				def tNode = allTs[tIx]
				def tId = getTNodeId(tNode)
				if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
					// Same id, same form: nothing to do.
					tIx += 1
				}
				else if (tId == word[0]) {
					println("Mismatched Ids! ($tId)")
					tIx += 1
				}
				else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
					// SRCMF duplicata; try comparing word against the next tNode next time
					// around.
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that the SRCMF corpus doesn't have a bug in it...
				// take(tIx) is empty when tIx == 0, whereas the range 0..tIx-1 would
				// become 0..-1 (the whole list) and always match the current node.
				else if ( hasNoPunctuation(word[1])
				&& (allTs.take(tIx).find{
					it.getAttribute('id') == "$tNodeIdPrefix#$tId"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				// Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
				else if ( hasNoPunctuation(word[1])
				&& (allTs[tIx..-1].find{
					it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
				})) {
					println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else if (addedWordIds.contains(tId)) {
					println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
					txmIdWordTable.add(0, word)
					tIx += 1
				}
				else {
					// Insert word.  In the first instance, it will have the same parent as
					// the tNode before which it's being inserted.
					addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
					addedWordIds.add(word[0])
				}
			}
		}
	}

	/**
	 * Second phase: move punctuation into previous sentence, dependent on sequence.
	 * For every <terminals> element but the first, the leading run of <t> nodes whose
	 * word contains no letter or digit is examined, and its first moveLeft nodes are
	 * appended to the previous <terminals> element.
	 */
	private moveLeadingPunctuation() {
		def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
		for (def i = 1 ; i < allTerminalses.size() ; i++) {
			def ts = toList(allTerminalses[i].getElementsByTagName('t'))
			def startPunc = true
			def puncStack = []
			while (ts && startPunc) {
				// A-Za-z rather than A-z: the latter also covers [ \ ] ^ _ ` and would
				// classify a closing bracket as a word.
				if ((ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
					puncStack.add(ts.remove(0))
				}
				else {
					startPunc = false
				}
			}
			// Now, treat the punctuation stack at the beginning of the sentence
			if ( puncStack ) {
				int moveLeft = 0
				// One character per stacked node, so string indexes map to node indexes.
				// An empty word contributes a space, which matches none of the patterns.
				def puncString = puncStack.collect{
					def form = it.getAttribute('word')
					form ? form[0] : ' '
				}.join('')
				// First, identify LAST instance of sentence-final punctuation.
				def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
				if (matches.size() > 0) {
					moveLeft = puncString.lastIndexOf(matches[-1]) + 1
				}
				// Second, split pairs of straight quotes
				matches = puncString =~ /(""|'')/ //"
				if (matches.size() > 0) {
					moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
				}
				// Now, move moveLeft punctuation nodes to the end of the prev. sentence
				// (the node list is fetched again because ts was consumed above).
				ts = toList(allTerminalses[i].getElementsByTagName('t'))
				for (def j = 0 ; j < moveLeft ; j++ ) {
					allTerminalses[i - 1].appendChild(ts[j])
				}
			}
		}
	}

	/**
	 * Creates a <t> element for the given word and attaches it next to tNode.
	 * The new element gets every attribute name in tAttrs: 'id' is built from the
	 * prefix and the word id, 'word' is the word form, all others are set to '--'.
	 *
	 * @param word          [id, form] pair
	 * @param tNode         reference <t> node
	 * @param tAttrs        attribute names to create on the new node
	 * @param tNodeIdPrefix prefix placed before '#' in the new node id
	 * @param where         'before' (insert before tNode) or 'append' (append to tNode's parent)
	 * @throws IllegalArgumentException if where is neither 'before' nor 'append'
	 */
	private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
		def newTNode = tNode.getOwnerDocument().createElement('t')
		for (def anAttr : tAttrs) {
			if (anAttr == 'id') {
				newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
			}
			else if (anAttr == 'word') {
				newTNode.setAttribute('word', word[1])
			}
			else {
				newTNode.setAttribute(anAttr, '--')
			}
		}
		if (where == 'before') {
			tNode.getParentNode().insertBefore(newTNode, tNode)
		}
		else if (where == 'append') {
			tNode.getParentNode().appendChild(newTNode)
		}
		else {
			throw new IllegalArgumentException("Bad where value: ${where}")
		}
	}

	/** Returns the names of all attributes of the given DOM node, as a list. */
	def getTAttrNames(tNode) {
		def nodeMap = tNode.attributes
		def nameList = []
		for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
			nameList.add( nodeMap.item(i).nodeName )
		}
		return nameList
	}

	/** Returns the part of the node's 'id' attribute that follows the last '#'. */
	def getTNodeId(tNode) {
		return tNode.getAttribute('id').tokenize('#').last()
	}

	/** Copies the items of an iterable (e.g. a DOM NodeList) into a new list. */
	def toList(def iterable) {
		return iterable.findAll {true};
	}

	/** Concatenates, depth first, all the String children found below the given Groovy node. */
	def getDeepText(def node) {
		def str = ""
		node.children().each { child ->
			if (child instanceof Node) {
				str += getDeepText(child)
			} else if (child instanceof String) {
				str += child
			}
		}
		return str
	}

	/**
	 * Command-line entry point: PunctInjectImport.groovy [-o outputfile.xml] TigerXml.xml XmlTxm.xml
	 * Exits with status 2 when the command line is invalid.
	 */
	public static void main(String[] args)
	{
		def cli = new CliBuilder(
				usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
				)
		cli.h(longOpt:'help', 'Prints this message.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
		def options = cli.parse(args)
		if (options == null) {
			// parse() has already reported the error and printed the usage.
			System.exit(2)
		}
		if (options.h) {
			cli.usage()
			return
		}
		if (options.arguments().size() != 2) {
			println 'Incorrect number of command line arguments... exiting'
			// usage() prints by itself; wrapping it in println would add a "null" line.
			cli.usage()
			System.exit(2)
		}
		def tigerXmlFile = new File(options.arguments()[0])
		def xmlTxmFile = new File(options.arguments()[1])
		def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
		if (options.o) {
			pii.outputFile = new File(options.o)
		}
		pii.process()
	}
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/srcmfLoader.groovy (revision 911)
1
package org.txm.importer.srcmf
2

  
3
// srcdir
4
// txml/master.xml
5
// txml/*.xml
6
// txm/*.xml
7

  
8
// 1 import xml-txm
9
import ims.tiger.gui.tigerregistry.TIGERRegistry;
10
import ims.tiger.index.writer.IndexBuilderErrorHandler;
11
import ims.tiger.index.writer.SimpleErrorHandler;
12
import ims.tiger.index.writer.XMLIndexing;
13
import ims.tiger.system.*;
14

  
15
import javax.xml.stream.*;
16

  
17
import java.io.BufferedWriter;
18
import java.io.File;
19
import java.io.FileOutputStream;
20
import java.io.IOException;
21
import java.io.OutputStreamWriter;
22
import java.io.PrintStream;
23
import java.net.URL;
24
import java.util.HashMap;
25
import java.util.logging.FileHandler
26
import java.io.File;
27

  
28
import org.apache.log4j.BasicConfigurator;
29
import org.txm.export.ts.*;
30
import org.txm.importer.ApplyXsl2;
31
import org.txm.utils.xml.DomUtils;
32
import org.txm.importer.ValidateXml;
33
import org.txm.importer.XPathResult;
34
import org.txm.importer.xmltxm.compiler;
35
import org.txm.importer.xml.pager;
36
import org.txm.objects.*;
37
import org.txm.utils.*;
38
import org.txm.*;
39
import org.txm.scripts.teitxm.*;
40
import org.txm.metadatas.*;
41
import org.txm.utils.i18n.*;
42
import org.w3c.dom.Element
43
import org.xml.sax.SAXException;
44

  
45
import javax.xml.parsers.DocumentBuilderFactory
46

  
47
// Import script for SRCMF sources. Expected layout of the source directory:
//   <rootDir>/xml-txm/*.xml          XML-TXM text file(s)
//   <rootDir>/tiger-xml/master.xml   Tiger-XML master file
//   <rootDir>/tiger-xml/header.xml   Tiger-XML header file
// The binary corpus is built under <TXM home>/corpora/<basename>.
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
// "paramsBinding" and "monitor" are read from the script binding; when they are
// missing the script falls back to a hard-coded development configuration.
try {
	params = paramsBinding
	MONITOR = monitor
} catch (Exception e) {
	println "DEV MODE";//exception means we debug
	debug = true
	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
	params.load()
	if (!org.txm.Toolbox.isInitialized()) {
		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
	}
}
if (params == null) { println "no parameters. Aborting"; return; }

// Import parameters
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// Start from an empty binary directory, then check the source directories.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
	println "Could not create binDir "+binDir
	return;
}
if (!txmSrcDir.exists()) {
	println "Error: txm src dir does not exists: "+txmSrcDir
	return;
}
if (!tigerSrcDir.exists()) {
	println "Error: tiger src dir does not exists: "+tigerSrcDir
	return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
	if (f.getName().equals("import.xml") || f.getName().equals("metadata.csv") || f.getName().endsWith(".properties"))
		continue;
	if (ValidateXml.test(f)) {
		FileCopy.copy(f, new File(txmDir, f.getName()));
	} else {
		println "Won't process file "+f;
	}
}

// listFiles() returns an empty array (not null) for an empty directory, so both
// cases have to be tested to detect that no file was copied.
List<File> filelist = txmDir.listFiles();
if (!filelist) {
	println "No txm file to process"
	return;
}
Collections.sort(filelist);

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"
def c = new compiler();
if (debug) c.setDebug();
c.setLang(lang);
//c.setSortMetadata(sortMetadata)
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
	println "import process stopped";
	return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();

// Number of file names already printed; drives the ", " separators and the
// line break inserted every five names.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
	String txtname = srcfile.getName();
	int dot = txtname.lastIndexOf(".");
	if (dot > 0) txtname = txtname.substring(0, dot);
	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
	if (second > 0) { print(", ") }
	if (second > 0 && (second % 5) == 0) println ""
	second++
	print(srcfile.getName());
	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
	// Pages are registered with 1-based numbers: page i+1 starts at word id getIdx().get(i).
	for (int i = 0 ; i < ed.getPageFiles().size() ; i++) {
		String wordid = ed.getIdx().get(i);
		params.addPage(edition, ""+(i + 1), wordid);
	}
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);
// Undeclared on purpose: stored in the script binding for the caller.
readyToLoad = true;


// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");
File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
PunctInjectImport injector;
if (Toolbox.isInitialized()) {
	// get idx from CQP
	// INCOMPLETE
	// def injector = new PunctInjectImport(tmpFile, words)
	println "Error: the loader is not ready to be launched from RCP"
	return;
} else {
	injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
}
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()

// Create the tigersearch.logprop file (used to launch TIGERSearch)
File logprop = new File(tigerDir, "tigersearch.logprop");
logprop.withWriter("UTF-8") { writer ->
	writer.write("""# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")
}

// Run TigerRegistry
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
	IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
	XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
	indexing.startIndexing();
}
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }

// The intermediate Tiger-XML file is no longer needed once indexing has been attempted.
tigerXmlAll.delete()
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/TigerSubcorpus2Main.groovy (revision 911)
1
/*
2
Combines subcorpus files to give a single main file.
3
 */
4
// TXM package statement
5
package org.txm.importer.srcmf
6

  
7
// Command-line entry point
8

  
9
def cli = new CliBuilder(
10
    usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
11
)
12
cli.h(longOpt:'help', 'Prints this message.')
13
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
14
options = cli.parse(args)
15
if (options.arguments().size() != 1) {
16
    println 'Incorrect number of command line arguments... exiting'
17
    println cli.usage()
18
    System.exit(2)
19
}
20
def tigerMaster = new File(options.arguments()[0])
21
def outputFile = new File('outputfile.xml')
22
if (options.o) {
23
    outputFile = new File(options.o)
24
}
25
script(tigerMaster, outputFile)
26

  
27
/**
 * Merges the external subcorpus files referenced by a TIGER master file into
 * one document and writes it to outputFile as UTF-8.
 *
 * Every subcorpus child of each body element is removed from the body and the
 * children of the referenced file's root element are appended to that body in
 * its place. The referenced file name is the "external" attribute with its
 * first five characters dropped, resolved against the master file's directory.
 *
 * @param tigerMaster the TIGER master file to read
 * @param outputFile  the file receiving the merged document
 */
def script (File tigerMaster, File outputFile) {
    def parsedMaster = new XmlParser().parse(tigerMaster)
    parsedMaster.body.each { bodyNode ->
        // bodyNode.subcorpus is a separate list, so removing the reference
        // from bodyNode while walking it is safe.
        bodyNode.subcorpus.each { reference ->
            String relativeName = (""+reference.@external).substring(5)
            File referencedFile = new File(tigerMaster.getParentFile(), relativeName)
            def parsedSub = new XmlParser().parse(referencedFile)
            bodyNode.remove(reference)
            parsedSub.children().each { bodyNode.append(it) }
        }
    }
    // Serialize to memory first, then emit the XML declaration followed by
    // the serialized tree.
    def buffer = new StringWriter()
    def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
    printer.print(parsedMaster)
    def serialized = buffer.toString()
    outputFile.withWriter("UTF-8") { out ->
        out.println('<?xml version="1.0" encoding="UTF-8"?>')
        out.print(serialized)
    }
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/package.html (revision 911)
1
<html>
2
<body>
3
<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/MasterReheader.groovy (revision 911)
1
#! /usr/bin/groovy
2
package org.txm.importer.srcmf;
3

  
4
/*
5
 * To change this template, choose Tools | Templates
6
 * and open the template in the editor.
7
 */
8

  
9
// Set up globals
10
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
11
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
12
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
13
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']]
14
// def firstFeat = [t:'word', nt:'cat']
15

  
16
// Command-line entry point
17
// Parse the command line: two positional arguments (master file, header
// file) plus optional output file and feature selections.
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which  to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)
// cli.parse() returns null when the arguments cannot be parsed; it has
// already reported the problem, so there is nothing more to do in that case.
if (options != null && options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    // Default output sits next to the master file.
    def outputFile = options.o ? new File(options.o) : new File(masterFolder, 'MasterReheader_out.xml')
    // An absent option evaluates to false, so fall back to the defaults.
    // These defaults were previously computed but never passed on.
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    script(
        masterFile, headerFile, outputFile,
        // The plural accessors yield every value given for -nt / -t; when the
        // option is absent they yield false, which script() cannot search,
        // so substitute an empty list (no value nodes requested).
        ['nt':(options.nts ?: []), 't':(options.ts ?: [])],
        ['nt':ntfirst, 't':tfirst]
    )
} else if (options != null) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; println around it would add a "null" line.
    cli.usage()
}
54

  
55
/**
 * Rebuilds the feature declarations in the head of a TIGER master file from
 * the attributes actually used in its subcorpus files.
 *
 * All existing feature nodes under head/annotation are removed. One feature
 * node is then created per attribute found on t and nt elements (except
 * "id"), with the attribute named in firstFeat listed first for each type.
 * For attributes listed in feats, a value node is added per observed value,
 * whose text is taken from the matching value in the header file, or
 * "[unknown]" when the header has none.
 *
 * @param masterFile the master file; subcorpus files are resolved against its directory
 * @param headerFile the header file supplying value descriptions
 * @param outputFile the file receiving the rewritten master document (UTF-8)
 * @param feats      map with keys 'nt' and 't' giving the features that get value nodes
 * @param firstFeat  map with keys 'nt' and 't' giving the feature to list first;
 *                   replaced by the alphabetically first attribute when it is
 *                   not present in the data (the argument itself is not modified)
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // attrVal[type][attributeName] -> set of values seen for that attribute
    def attrVal = [nt:[:], t:[:]]

    // Records every attribute of a t / nt node in attrVal.
    def collectValues = { node, type ->
        node.attributes().each { entry ->
            if (! attrVal[type].containsKey(entry.getKey())) {
                attrVal[type][entry.getKey()] = new HashSet()
            }
            attrVal[type][entry.getKey()].add(entry.getValue())
        }
    }

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each {
        // The external attribute has its first five characters stripped to
        // obtain the file name.
        def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        subcorpus.s.graph.terminals.t.each { collectValues(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { collectValues(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')

    // Remove old feature nodes in master file. The GPath result is a separate
    // list, so detaching nodes from their parents while iterating is safe.
    master.head.annotation.feature.each { oldFeature ->
        oldFeature.parent().remove(oldFeature)
    }
    assert (! master.head.annotation.feature)

    // Check firstFeat was relevant; work on a copy so the caller's map is
    // left untouched.
    def first = new HashMap(firstFeat)
    ['t', 'nt'].each { type ->
        if (! attrVal[type].containsKey(first[type])) {
            first[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(first['t'])
    assert attrVal['nt'].keySet().contains(first['nt'])

    // Ordered feature names per type: the first feature, then all others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [first[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != first[type] })
    }

    // The header is never modified below, so its feature list can be
    // collected once instead of re-traversing the header for every value.
    def headerFeatures = header.'**'.feature

    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        featList[type].each { feat ->
            // Creating a Node with a parent attaches it to that parent.
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value node if the node value is given in 'feats'
            if (feats[type]?.contains(feat)) {
                attrVal[type][feat].each { value ->
                    // Sanity checks on the header, only performed when a
                    // value description is actually needed.
                    assert headerFeatures
                    assert headerFeatures[0].'@name'
                    assert headerFeatures[0].'@domain'
                    assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
                    def hFNode = headerFeatures.find {
                        it.'@name' == feat && (
                            it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                        )
                    }
                    def hValue = hFNode?.value?.find { it.'@name' == value }
                    def vText = hValue ? hValue.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() declares encoding="UTF-8" in
    // the XML prolog, so the bytes must be written as UTF-8 rather than in
    // the platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
138

  
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SrcmfImporter.groovy (revision 911)
1
/*
2
 * Calls all import scripts
3
 */
4

  
5
/**
6
 *
7
 * @author tmr
8
 */
9

  
10
// Command line form of import statements:
11
// import MasterReheader
12
// import PunctInjectImport
13
// import SubcorpusDataInject
14
// import TigerSubcorpus2Main
15
// import java.util.logging.FileHandler
16
// import javax.xml.parsers.DocumentBuilderFactory
17

  
18
// TXM package statement
19
package org.txm.importer.srcmf
20

  
21
import java.util.logging.*;
22

  
23

  
24
// Command line entry point
25
// Parse the command line: three positional arguments are required (TIGER
// master file, XML-TXM file, header file).
def cli = new CliBuilder(
    usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
options = cli.parse(args)
// cli.parse() returns null (after reporting the problem itself) when the
// arguments cannot be parsed, so guard before dereferencing the result.
if (options == null) {
    System.exit(2)
}
// The help option was declared but never acted upon.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 3) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; println around it would add a "null" line.
    cli.usage()
    System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The working ("bin") directory is the grandparent of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
    tigerFile,
    txmFile,
    headerFile,
    txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The merged file is only an intermediate product of the punctuation step.
tigerXmlAll.delete()
47

  
48
/**
 * Runs every import step except punctuation injection: data injection into
 * the subcorpus files, header rebuilding, and merging of master and
 * subcorpus files into a single document.
 *
 * Intermediate and result files are created in a "tiger" directory under
 * binDir; the log is written to "tiger.log" in binDir.
 *
 * @param tigerFile  the TIGER master file
 * @param txmFile    an XML-TXM file; the first file listed in its directory
 *                   is the one actually injected
 * @param headerFile the header file passed to the reheader step
 * @param binDir     the working directory
 * @return the merged "TigerAll.xml" file
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
    // Run pos injection script
    File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
    File tigerDir = new File(binDir, "tiger")
    tigerDir.mkdir()
    File masterpos = new File(tigerDir, "master_pos.xml")
    File xmltxm = txmSrcDir.listFiles()[0]
    File logFile = new File(binDir, "tiger.log")
    def logHandler = new FileHandler(logFile.getAbsolutePath())
    try {
        def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
        sdi.processMaster(tigerFile, masterpos)
    } finally {
        // The injector registers the handler on its static logger; detach
        // and close it so the log file is released and handlers do not
        // accumulate over successive imports.
        SubcorpusDataInject.logger.removeHandler(logHandler)
        logHandler.close()
    }
    // Run reheader script
    def reheader = new MasterReheader()
    File tmp = File.createTempFile("tmp", ".xml",tigerDir)
    def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
    def firstFeat = ['nt':'cat', 't':'word']
    reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
    if (!tmp.exists()) {
    	println "Error: reheader failed"
    }
    // Replace the injected master file with the reheadered one.
    masterpos.delete()
    if (!tmp.renameTo(masterpos)) {
        // renameTo() reports failure only through its return value.
        println "Error: could not rename ${tmp} to ${masterpos}"
    }
    // Run merge master & subcorpus script
    def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
    def mergescript = new TigerSubcorpus2Main()
    mergescript.script(masterpos, tigerXmlAll)
    return tigerXmlAll
}
78

  
79
/**
 * Runs the punctuation injection step on the merged TIGER file, writing the
 * result to "TigerPnc.xml" next to it.
 *
 * @param tigerXmlAll the merged TIGER file
 * @param txmFile     the XML-TXM file handed to the injector
 */
def doPnc(File tigerXmlAll, File txmFile) {
    // Declared locally: an undeclared assignment inside a script method
    // would store the injector in the script binding instead.
    def injector = new PunctInjectImport(tigerXmlAll, txmFile)
    injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
    injector.process()
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SubcorpusDataInject.groovy (revision 911)
1
#! /usr/bin/groovy
2
package org.txm.importer.srcmf;
3
/*
4
 * To change this template, choose Tools | Templates
5
 * and open the template in the editor.
6
 */
7

  
8
import java.util.logging.Logger
9
import java.util.logging.FileHandler
10
import java.util.logging.SimpleFormatter
11
import java.util.logging.Level
12

  
13

  
14
class SubcorpusDataInject {
15
	// Globals
16
	def textForm = 'prose'
17
	def TEI = null
18
	def wNodes = []
19
	def wNodesInQ = []
20
	// Set up namespaces
21
	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
22
	static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
23
	// Set up a logger
24
	static logger = Logger.getLogger('subcorpusDataInjectLog')
25
	// END GLOBALS
26

  
27
	/**
	 * Loads an XML-TXM file and prepares the word lists used for injection.
	 *
	 * The text form is newTextForm when non-empty, else the class default;
	 * either is overridden by the category referenced from the TEI header's
	 * textClass/catRef when that category can be resolved. For the forms
	 * 'vers' and 'mixte', each lb element is converted into an "lb"
	 * attribute ('end' on the preceding non-punctuation word, 'start' on the
	 * following one).
	 *
	 * @param xmlTxmFile  the XML-TXM file to parse
	 * @param fh          log handler; it is attached to the shared logger
	 * @param newTextForm default text form, ignored when empty
	 */
	SubcorpusDataInject (
	File xmlTxmFile,
	FileHandler fh,
	String newTextForm
	) {
		// Reset global values
		if (newTextForm) {
			textForm = newTextForm
		}
		// Set up logger
		fh.setFormatter(new SimpleFormatter())
		logger.addHandler(fh)
		logger.setLevel(Level.ALL)
		// Parse the XML-TXM file
		logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
		TEI = new XmlParser().parse(xmlTxmFile)
		// Get text form from TEI header (if expressed)
		logger.log(Level.INFO, "Reading TEI header to detect text form.")
		def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
		if (catRefs) {
			// The target has its leading character stripped to obtain the
			// category id.
			def catId = catRefs.'@target'[0][1..-1]
			def category = TEI.teiHeader.encodingDesc.'**'.category.find {
				it.'@id' == catId || it.attribute(nsXml.id) == catId
			}
			def catDesc = category?.catDesc
			if (catDesc) {
				textForm = catDesc[0].text()
				logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
			} else {
				// A reference that cannot be resolved used to abort with a
				// NullPointerException; keep the default form instead.
				logger.log(
						Level.WARNING,
						"Category '$catId' referenced in TEI header not found, assuming '$textForm'."
						)
			}
		} else {
			logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
		}
		// Convert Lbs to Property
		if (textForm == 'vers' || textForm == 'mixte') {
			logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
			// Work list of nodes still to visit, in document order; it is a
			// separate list, so removing from it does not alter the tree.
			def undone = TEI.text.body
			def lastLb = false
			def lastWord = null
			while (undone) {
				def aNode = undone.remove(0)
				if (aNode.name().getLocalPart() == 'lb') {
					lastLb = true
					if (lastWord) {
						lastWord.'@lb' = 'end'
					}
				} else if (aNode.name().getLocalPart() == 'w') {
					// Punctuation never carries the line-break marker.
					if (!punctTest(aNode)) {
						if (lastLb) {
							aNode.'@lb' = 'start'
						}
						lastWord = aNode
						lastLb = false
					}
				} else {
					// Descend: element children are visited next, text
					// content is ignored.
					undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
				}
			}
		}
		// Build node lists
		logger.log(Level.INFO, "Building node lists.")
		wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
		wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
	}
86

  
87
	/**
	 * Command-line entry point. Expects two positional arguments, the
	 * XML-TXM file and the TIGERSearch file, and processes the latter either
	 * as a master file (-m) or as a single subcorpus file.
	 */
	static main(def args) {
		// Defaults used when -o / -l are not given
		def outputFile = 'test-out.xml'
		def logName = 'subcorpusDataInjectLog.log'
		def cli = new CliBuilder(
				usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
				)
		cli.h(longOpt:'help', 'Prints this message')
		cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
		cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
		cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
		println args
		def options = cli.parse(args)
		// cli.parse() returns null when the arguments cannot be parsed; it
		// has already reported the problem.
		if (options == null) {
			return
		}
		if (options.arguments().size() != 2) {
			println "Wrong number of arguments."
			// usage() prints by itself; println around it would add a
			// stray "null" line.
			cli.usage()
			return
		}
		// An absent option evaluates to false, hence the elvis defaults.
		def handler = new FileHandler(options.logfile ?: logName)
		try {
			// Call XML TXM loader
			def sdi = new SubcorpusDataInject(
					new File (options.arguments()[0]),
					handler,
					options.v ? 'vers' : ''
					)
			// Call process master or process subcorpus
			def tsFile = new File(options.arguments()[1]).getCanonicalFile()
			def target = new File (options.output ?: outputFile)
			if (options.m) {
				sdi.processMaster(tsFile, target)
			} else {
				sdi.processSubcorpus(tsFile, target)
			}
		} finally {
			// Flush and release the log file.
			handler.close()
		}
	}
132

  
133
	/**
	 * Runs processSubcorpus on every subcorpus listed in a master file and
	 * saves an updated master file that points at the processed copies.
	 *
	 * Subcorpus files are read from the master file's directory and written
	 * to the output file's directory with "_pos.xml" replacing the last four
	 * characters of their name.
	 *
	 * @param masterFile the master file to read
	 * @param outputFile the master file to write (UTF-8)
	 */
	def processMaster (File masterFile, File outputFile) {
		logger.log(Level.INFO, "Loading corpus master file")
		// Resolve the directories through the absolute path. A file given
		// without a directory component has no parent, and an empty-string
		// parent would make new File(parent, child) resolve against the
		// system default directory (the root on Unix) rather than the
		// current directory.
		File masterFolder = masterFile.getAbsoluteFile().getParentFile()
		File outputFolder = outputFile.getAbsoluteFile().getParentFile()
		def master = new XmlParser().parse(masterFile)
		// Process each subcorpus file
		master.body.subcorpus.each {
			String external = it.'@external'
			// [5..-1] drops the first five characters of the reference;
			// [..-5] additionally drops the last four.
			processSubcorpus(
					new File (masterFolder, external[5..-1]),
					new File (outputFolder, external[5..-5] + '_pos.xml')
					)
			it.'@external' = external[0..-5] + '_pos.xml'
		}
		logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

		// XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog,
		// so write the bytes as UTF-8 rather than in the platform charset.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(master)
		}
		logger.log(Level.INFO, "Run reheader on this file!")
	}
163

  
164
	/**
	 * Injects "pos", "form" and "q" attributes into every t node of a
	 * subcorpus file, recalculates the "headpos" attribute of every nt node,
	 * and saves the result.
	 *
	 * Each t node is paired with a w node of the loaded XML-TXM file through
	 * idmatch. The w nodes are walked in parallel with the t nodes; t nodes
	 * whose word is '#' receive '--' values, t nodes whose word is '*'
	 * (duplicatas) take their values from the w node matching the t node
	 * they duplicate, and punctuation w nodes are skipped.
	 *
	 * @param subcorpusFile the subcorpus file to read
	 * @param outputFile    the file to write (UTF-8)
	 */
	def processSubcorpus (File subcorpusFile, File outputFile) {
		// Parse Subcorpus file
		logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
		def TS = new XmlParser().parse(subcorpusFile)
		// This is used as a stack
		def tNodes = TS.s.graph.terminals.t
		// This isn't
		def tNodesFixed = TS.s.graph.terminals.t

		// Build duplicata correspondance list: pairs of
		// [duplicata t-node id, id referenced by the 'dupl' secondary edge].
		logger.log(Level.INFO, "Building duplicata correspondance list")
		def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
			def duplId = dupl.'@id'
			def realId = dupl.parent().parent().nonterminals.nt.edge.find {
				it.'@idref' == duplId
			}.parent().secedge.find {
				it.'@label' == 'dupl'
			}.'@idref'
			[duplId, realId]
		}
		logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

		// Add pos & form tags
		logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
		// Get first wNode corresponding to a tNode
		def i = 0
		def wIx = null
		while (i < tNodes.size()) {
			// Avoid trying to find "fake" first words, it's very inefficient
			if (tNodes[i].'@word' != '#') {
				// Compare against -1 explicitly: under Groovy truth a valid
				// index of 0 is false and the "not found" value -1 is true.
				def candidate = wNodes.findIndexOf { idmatch(tNodes[i], it) }
				if (candidate >= 0) {
					wIx = candidate
					break
				}
			}
			i++
		}
		assert wIx != null, "$i"
		/*
		 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
		 */
		// Iterate over tNodes.
		while (tNodes) {
			def thisTNode = tNodes.remove(0)
			def thisTNodeId = thisTNode.'@editionId'
			def wNode = null
			def q = false
			// Define the PoS / q / form injecting closure. It reads wNode and
			// q, which are assigned by the branches below before each call.
			def injectPos = {
				// Q tag
				if (q) {
					thisTNode.'@q' = 'y'
				} else {
					thisTNode.'@q' = 'n'
				}
				// Form tag
				if (textForm == 'prose') {
					thisTNode.'@form' = 'prose'
				} else if (
				textForm == 'vers' ||
				(
				textForm == 'mixte'
				&& wNode.parent().parent().name().getLocalPart() == 'ab'
				)
				) {
					if (! wNode.'@lb') {
						thisTNode.'@form' = 'vers'
					} else if (wNode.'@lb' == 'start') {
						thisTNode.'@form' = 'vers_debut'
					} else if (wNode.'@lb' == 'end')  {
						thisTNode.'@form' = 'vers_fin'
					}
				}
				// DEBUG TMR 22/08/2012
				else {
					thisTNode.'@form' = 'prose'
				}
				// END DEBUG TMR 22/08/2012
				// Find PoS tag in BFM file
				// Try txm:ana tags
				def txmAnaList = wNode[nsTxm.ana]
				// Then interp tags
				if (! txmAnaList) {
					txmAnaList = wNode.interp
				}
				// The lookups must compare with '=='. An assignment here
				// would overwrite the annotation's type and always succeed.
				def posSyn = txmAnaList.find { it.'@type' == '#pos_syn' }
				def pos = txmAnaList.find { it.'@type' == '#pos' }
				def fropos = txmAnaList.find { it.'@type' == '#fropos' }
				// Try type="#pos_syn"
				if (posSyn) {
					thisTNode.'@pos' = posSyn.text()
					// type attr of wNode
				} else if (! txmAnaList && wNode.'@type') {
					thisTNode.'@pos' = wNode.'@type'
					// type="#pos"
				} else if (pos) {
					thisTNode.'@pos' = pos.text()
					// type="#fropos"
				} else if (fropos) {
					thisTNode.'@pos' = fropos.text()
				} else {
					thisTNode.'@pos' = '--'
				}
			}
			// End of injecting closure.
			// Check whether wNode matches tNode
			if (thisTNode.'@word' == '#') {
				// Not expecting to find these in BFM file, all OK.
				thisTNode.'@pos' = '--'
				thisTNode.'@form' = '--'
				thisTNode.'@q' = '--'
			} else if ( idmatch(thisTNode, wNodes[wIx]) ) {
				// Perfect match
				wNode = wNodes[wIx]
				q = wNodesInQ.contains(wNode)
				injectPos.call()
				wIx++
			} else if (thisTNode.'@word' == '*') {
				// Duplicata, use dupl match list to find correct wNode
				def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
				if (duplIdMatch) {
					def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
					// To enhance performance, I assume initially that the wNode matching the
					// duplicata is within +- 10 words of the current position in the
					// list.  The wNodes list is very long, and searches over the entire
					// list are avoided.
					assert matchTNode, "$duplIdMatch"
					if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
						wNode = wNodes[wIx-10 .. wIx+10].find {
							idmatch(matchTNode, it)
						}
					}
					// Use whole list if it failed.
					if (! wNode) {
						wNode = wNodes.find {
							idmatch(matchTNode, it)
						}
					}
					q = wNodesInQ.contains(wNode)
					injectPos.call()
					// Don't update wIx value!
				} else {
					thisTNode.'@pos' = '--'
					thisTNode.'@form' = '--'
					thisTNode.'@q' = '--'
					logger.log(
							Level.WARNING,
							"<w/> node matching TS duplicata $thisTNodeId not found."
							)
				}
			}  else if (punctTest(wNodes[wIx])) {
				// not expecting to find these in TS file; try same tNode again.
				tNodes.add(0, thisTNode)
				wIx++
			} else if (!
			wNodes.find {
				idmatch (thisTNode, it)
			}
			) {
				// Check the SRCMF corpus hasn't acquired an extra word
				logger.log(
						Level.WARNING,
						"TS word $thisTNodeId (${thisTNode.'@word'}) is " +
						"absent from BFM file."
						)
				thisTNode.'@pos' = '--'
				thisTNode.'@form' = '--'
				thisTNode.'@q' = '--'
			} else {
				// It doesn't match, it's not a duplicata,
				// it's not BFM punctuation, but it IS found somewhere in the BFM
				// corpus.
				logger.log(
						Level.WARNING,
						"Discontinuity in TS corpus before word $thisTNodeId " +
						"(${thisTNode.'@word'})."
						)
				wNode = wNodes.find {
					idmatch(thisTNode, it)
				}
				q = wNodesInQ.contains(wNode)
				injectPos.call()
				// Resume the parallel walk just after the word found.
				wIx = wNodes.indexOf(wNode) + 1
			}
		}
		/*
		 * End Main iteration
		 */
		/*******************
		 * Recalculate headpos
		 */
		logger.log(Level.INFO, "Recalculating headpos.")
		TS.s.graph.nonterminals.nt.each { nt ->
			// Terminals of the graph this nt belongs to
			def terminals = nt.parent().parent().terminals.t
			// Edges labelled 'L'
			def a = nt.edge.findAll { it.'@label' == 'L' }
			if (a.size() == 0) {
				nt.'@headpos' = '--'
			} else if (a.size() == 1) {
				nt.'@headpos' = terminals.find {
					it.'@id' == a[0].'@idref'
				}.'@pos'
			} else {
				// Several 'L' edges: collect the pos of every referenced
				// terminal (collect, not each: each returns the edge list
				// itself, which never contains a pos string) and keep the
				// first tag of the priority list found among them.
				def headPosTags = a.collect { edge ->
					terminals.find {
						it.'@id' == edge.'@idref'
					}.'@pos'
				}
				for (def headpos : ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET']) {
					if (headPosTags.contains(headpos)) {
						nt.'@headpos' = headpos + '?'
						break
					}
				}
			}
			assert nt.'@headpos'
		}
		logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

		// XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog,
		// so write the bytes as UTF-8 rather than in the platform charset.
		outputFile.withWriter('UTF-8') { writer ->
			writer << groovy.xml.XmlUtil.serialize(TS)
		}
	}
384
	// Subroutine self-test to establish whether a Groovy node is punctuation
385
	/**
	 * Tells whether a node is punctuation.
	 *
	 * When the node has a non-empty "type" attribute, the answer is whether
	 * that attribute starts with 'pon' (case-insensitive). Otherwise the
	 * node's txm:ana children are examined or, when it has none, its interp
	 * children: the answer is whether any of them has a text starting with
	 * 'pon' (case-insensitive).
	 */
	private punctTest(gNode) {
		def typeAttr = gNode.'@type'
		if (typeAttr) {
			return typeAttr.toLowerCase().startsWith('pon')
		}
		// Prefer txm:ana annotations, fall back to interp ones; an empty
		// list yields false.
		def annotations = gNode[nsTxm.ana] ?: gNode.interp
		return annotations.any { it.text().toLowerCase().startsWith('pon') }
	}
396

  
397
	private idmatch (tNode, wNode) {
398
		// Matches a TS node with a <w/> node
399
		def idMatch = false
400
		def wNodeId = wNode.'@id'
401
		if (! wNodeId) {
402
			wNodeId = wNode.attribute(nsXml.id)
403
		}
404
		def tNodeId = tNode.'@editionId'
405
		def regex = /(\d+)([_a-z]+)?$/
406
		def m1 = (tNodeId =~ regex)
407
		def m2 = (wNodeId =~ regex)
408
		if ( m1.find() && m2.find() ) {
409
			idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger())
410
		} else {
411
			logger.log(
412
					Level.WARNING,
413
					"Idmatch routine can't find integer part of one or both of the" +
414
					"paired Ids for nodes $tNode, $wNode"
415
					)
416
		}
... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff