Revision 187

tmp/org.txm.core/default.xml (revision 187)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<workspace name="default">
3
   <projects>
4
      <project name="default">
5
         <bases>
6
         </bases>
7

  
8
         <corpora>
9
         </corpora>
10
      </project>
11
   </projects>
12
</workspace>
0 13

  
tmp/org.txm.core/.project (revision 187)
1
<?xml version="1.0" encoding="UTF-8"?>
2
<projectDescription>
3
	<name>org.txm.core</name>
4
	<comment></comment>
5
	<projects>
6
	</projects>
7
	<buildSpec>
8
		<buildCommand>
9
			<name>org.eclipse.jdt.core.javabuilder</name>
10
			<arguments>
11
			</arguments>
12
		</buildCommand>
13
		<buildCommand>
14
			<name>org.eclipse.pde.ManifestBuilder</name>
15
			<arguments>
16
			</arguments>
17
		</buildCommand>
18
		<buildCommand>
19
			<name>org.eclipse.pde.SchemaBuilder</name>
20
			<arguments>
21
			</arguments>
22
		</buildCommand>
23
		<buildCommand>
24
			<name>net.sourceforge.metrics.builder</name>
25
			<arguments>
26
			</arguments>
27
		</buildCommand>
28
		<buildCommand>
29
			<name>com.stateofflow.eclipse.metrics.MetricsBuilder</name>
30
			<arguments>
31
			</arguments>
32
		</buildCommand>
33
	</buildSpec>
34
	<natures>
35
		<nature>org.eclipse.jdt.groovy.core.groovyNature</nature>
36
		<nature>org.eclipse.pde.PluginNature</nature>
37
		<nature>org.eclipse.jdt.core.javanature</nature>
38
		<nature>net.sourceforge.metrics.nature</nature>
39
		<nature>com.stateofflow.eclipse.metrics.MetricsNature</nature>
40
	</natures>
41
</projectDescription>
0 42

  
tmp/org.txm.core/buildTST.properties (revision 187)
1
bin.includes = META-INF/,\
2
               .,\
3
               lib/
4
jars.compile.order = .,\
5
                     bin/
6
source.. = src/groovy/,\
7
           src/java/
8
jars.extra.classpath = lib/ant-1.7.1.jar,\
9
                       lib/antlr-2.7.7.jar,\
10
                       lib/asm-analysis-2.2.3.jar,\
11
                       lib/asm-tree-2.2.3.jar,\
12
                       lib/colt-1.2.0.jar,\
13
                       lib/commons-cli-1.2.jar,\
14
                       lib/commons-io-1.4.jar,\
15
                       lib/commons-lang-2.4.jar,\
16
                       lib/concurrent-1.3.4.jar,\
17
                       lib/hsqldb.jar,\
18
                       lib/jline-0.9.94.jar,\
19
                       lib/junit-4.5.jar,\
20
                       lib/log4j-1.2.12.jar,\
21
                       lib/REngine.jar,\
22
                       lib/RserveEngine.jar,\
23
                       lib/saxon-xom-9.2.jar
24
source.. = src/java/
25
output.. = bin/
0 26

  
tmp/org.txm.core/buildJavadoc.xml (revision 187)
1
<project name="Toolbox" default="doc">
2
	
3
	<path id="lib.dir">
4
		<!-- fileset : TO BE DEFINED MANUALLY -->
5
		<fileset dir="/home/mdecorde/LIBRAIRIES/groovy-2.2.1/embeddable">
6
			<include name="*.jar" />
7
		</fileset>
8
	</path>
9

  
10
	<tstamp>
11
		<format property="TODAY" pattern="yyyy-MM-dd HH:mm" />
12
	</tstamp>
13

  
14
	<taskdef name="groovydoc" classname="org.codehaus.groovy.ant.Groovydoc" classpathref="lib.dir" />
15
	
16
	<target name="doc">
17
		<mkdir dir="javadoc/" />
18
		<!-- packagenames : TO BE UPDATED MANUALLY using the TBX exported packages list, because org.txm.utils.tostring.ToString makes groovydoc raise an exception. -->
19
		<groovydoc 
20
			destdir="javadoc/" 
21
			sourcepath="src/java:src/groovy" 
22
			packagenames="filters.BuildXmlLiturgie,filters.Concatenator,filters.CutHeader,filters.EliminateWhiteSpaces,filters.FusionHeader,filters.FusionXmlHeaderBody,filters.MinimalFilter,filters.OneOpenTagPerLine,filters.OneTagPerLine,filters.ProcessEnclitics,filters.ProcessQuotes,filters.RegexFilter,filters.ReunitBrokenTags,filters.ReunitBrokenWords,filters.TagSentences,filters.Tokeniser,filters.WordInternalElement,filters.eliminateNAttributes,org.txm,org.txm.annotation,org.txm.doc,org.txm.export,org.txm.export.conll2009,org.txm.export.ts,org.txm.functions,org.txm.functions.ca,org.txm.functions.classification,org.txm.functions.concordances,org.txm.functions.concordances.comparators,org.txm.functions.contrasts,org.txm.functions.cooccurrences,org.txm.functions.cooccurrences.comparators,org.txm.functions.coocmatrix,org.txm.functions.diagnostic,org.txm.functions.internal,org.txm.functions.intertextualdistance,org.txm.functions.mesures,org.txm.functions.parabrowser,org.txm.functions.parallelcontexts,org.txm.functions.preview,org.txm.functions.queryindex,org.txm.functions.referencer,org.txm.functions.progression,org.txm.functions.selection,org.txm.functions.specificities,org.txm.functions.summary,org.txm.functions.index,org.txm.importer,org.txm.importer.RGAQCJ,org.txm.importer.alceste,org.txm.importer.bfm,org.txm.importer.bvh,org.txm.importer.corptef,org.txm.importer.csv,org.txm.importer.cwb,org.txm.importer.discours,org.txm.importer.doc,org.txm.importer.europress,org.txm.importer.factiva,org.txm.importer.filters,org.txm.importer.fleurs,org.txm.importer.frantext,org.txm.importer.graal,org.txm.importer.hyperbase,org.txm.importer.hyperprince,org.txm.importer.lasla,org.txm.importer.limsi,org.txm.importer.perrault,org.txm.importer.quick,org.txm.importer.scripting,org.txm.importer.tigersearch,org.txm.importer.tmx,org.txm.importer.transana,org.txm.importer.transcriber,org.txm.importer.wtc,org.txm.importer.xml,org.txm.importer.xmltxm,org.txm.importer.xmltxmpara,org.txm.metadatas,org.
txm.objects,org.txm.renderer,org.txm.scripts,org.txm.scripts.clix,org.txm.scripts.i18n,org.txm.scripts.teitxm,org.txm.scripts.tigersearch,org.txm.searchengine.cqp,org.txm.searchengine.cqp.clientExceptions,org.txm.searchengine.cqp.corpus,org.txm.searchengine.cqp.corpus.query,org.txm.searchengine.cqp.serverException,org.txm.searchengine.ts,org.txm.setup,org.txm.sql,org.txm.stat,org.txm.stat.data,org.txm.stat.engine.r,org.txm.stat.engine.r.data,org.txm.stat.engine.r.function,org.txm.stat.engine.r.rcolt,org.txm.stat.utils,org.txm.svn,org.txm.sw,org.txm.test,org.txm.tests,org.txm.tokenizer,org.txm.toolbox,org.txm.utils,org.txm.utils.i18n,org.txm.utils.logger,org.txm.utils.processbuilder,org.txm.utils.saxon,org.txm.utils.treetagger" 
23
			use="true"  
24
			windowtitle="TXM Groovy and Java documentation ${TODAY}" 
25
			doctitle="TXM Groovy and Java documentation ${TODAY}" 
26
			header="TXM Groovy and Java documentation ${TODAY}" 
27
			footer="TXM Groovy and Java documentation ${TODAY}" 
28
			private="false">
29
			
30
			<link packages="java.,org.xml.,javax.,org.xml." href="http://java.sun.com/j2se/1.5.0/docs/api" />
31
			<link packages="org.apache.ant.,org.apache.tools.ant." href="http://www.dpml.net/api/ant/1.7.0" />
32
			<link packages="org.junit.,junit.framework." href="http://junit.sourceforge.net/junit3.8.1/javadoc/" />
33
			<link packages="groovy.,org.codehaus.groovy." href="http://groovy.codehaus.org/api/" />
34
		</groovydoc>
35
	</target>
36
</project>
0 37

  
tmp/org.txm.core/src/groovy/filters/FusionXmlHeaderBody/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to restore the XML-TEI header of XML-TEI files</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/FusionXmlHeaderBody/FusionXmlHeaderBody.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.FusionXmlHeaderBody;
29

  
30
import org.txm.importer.filters.*;
31
import org.xml.sax.InputSource
32
import org.xml.sax.helpers.ParserAdapter
33
import org.xml.sax.helpers.*;
34
import javax.xml.parsers.SAXParserFactory;
35
import javax.xml.parsers.SAXParser;
36
import java.io.Reader;
37
import groovy.xml.XmlUtil
38

  
39
import javax.xml.transform.*;
40
import javax.xml.transform.stream.StreamResult;
41
import javax.xml.transform.stream.StreamSource;
42

  
43
import org.xml.sax.helpers.DefaultHandler
44

  
45
// TODO: Auto-generated Javadoc
46
/**
 * Filter that writes an XML document built from a header entry and the
 * children of the body document received on the filter input.
 *
 * The real input is put aside in {@link #before()} and only parsed in
 * {@link #after()}; {@link #filter()} itself does nothing.
 */
class FusionXmlHeaderBody extends Filter
{
	/** Name of the body file; the part before the first dot is the lookup id. */
	String bodyFile = "";

	/** Location of the XML headers document, handed to XmlSlurper as-is. */
	String headerFile = "";

	/** The reader that was the filter input before {@link #before()} replaced it. */
	Reader Lastinput;

	/**
	 * Accepts a single untyped parameter object and ignores it.
	 *
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args)
	{
	}

	/**
	 * Configures the filter from an array of exactly two entries.
	 *
	 * @param args the body file name followed by the header file
	 */
	void SetUsedParam(Object[] args)
	{
		if (args.size() != 2)
		{
			System.out.println("FusionXmlHeaderBody need 2 args : \nString:XmlBodyFile\nString:XmlHeaderFile");
			return;
		}
		bodyFile = args[0];//needed to get back its ID
		headerFile = args[1];
	}

	/**
	 * Reports missing parameters, then swaps the input for an empty reader
	 * and remembers the original one for {@link #after()}.
	 *
	 * NOTE(review): there is no explicit return; Groovy returns the value of
	 * the final assignment coerced to boolean. Confirm what Filter expects.
	 *
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before()
	{
		boolean missingArgs = (bodyFile == "" || headerFile == "");
		if (missingArgs)
		{
			System.out.println("FusionXmlHeaderBody need 2 args : \nString:XmlBodyFile\nString:XmlHeaderFile");
		}
		System.out.println("start fusionxmlheaderbody "+this);
		Lastinput = this.input;
		this.input = new BufferedReader(new java.io.StringReader("") );
	}

	/**
	 * Restores the original input, parses it together with the headers
	 * document and serializes the merged markup to the output.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after()
	{
		this.input = this.Lastinput;
		def bodyDoc = new XmlSlurper().parse(this.input);
		def headerDoc = new XmlSlurper().parse(headerFile)

		// the header entry is looked up by the body file name without extension
		def bodyId = bodyFile.split('\\.')[0]
		def matchingHeader = headerDoc.discours.find{ entry -> entry.@file == bodyId }

		this.output.withWriter{ writer ->
			def markup = new groovy.xml.StreamingMarkupBuilder().bind{
				mkp.xmlDeclaration();
				top (matchingHeader << bodyDoc.children());
			};
			writer << XmlUtil.serialize(markup);
		}
	}

	/**
	 * Intentionally empty: all the work happens in {@link #after()}.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter()
	{
	}

}
tmp/org.txm.core/src/groovy/filters/FusionXmlHeaderBody/ReaderInputStream.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.FusionXmlHeaderBody;
29

  
30
/*
31
 * Copyright 2004-2005 The Apache Software Foundation.
32
 *
33
 *  Licensed under the Apache License, Version 2.0 (the "License");
34
 *  you may not use this file except in compliance with the License.
35
 *  You may obtain a copy of the License at
36
 *
37
 *      http://www.apache.org/licenses/LICENSE-2.0
38
 *
39
 *  Unless required by applicable law or agreed to in writing, software
40
 *  distributed under the License is distributed on an "AS IS" BASIS,
41
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
42
 *  See the License for the specific language governing permissions and
43
 *  limitations under the License.
44
 */
45
import java.io.IOException;
46
import java.io.InputStream;
47
import java.io.Reader;
48

  
49
// TODO: Auto-generated Javadoc
50
/**
 * Adapts a <code>Reader</code> as an <code>InputStream</code>. Adapted from
 * <CODE>StringInputStream</CODE>.
 *
 * Characters read from the wrapped reader are encoded with {@link #encoding}
 * and the resulting bytes are handed out; bytes that did not fit in the
 * caller's buffer are kept in {@link #slack} until the next read.
 */
public class ReaderInputStream extends InputStream {
	/** The wrapped reader; <code>null</code> once {@link #close()} has run. */
	Reader input;

	/** Charset name used to turn characters into bytes. */
	String encoding = System.getProperty("file.encoding");

	/** Encoded bytes not yet returned to the caller, or <code>null</code>. */
	byte[] slack;

	/** Index in {@link #slack} of the next byte to return. */
	int begin;

	/**
	 * Construct a <CODE>ReaderInputStream</CODE> for the specified
	 * <CODE>Reader</CODE>.
	 * 
	 * @param reader
	 *            <CODE>Reader</CODE>. Must not be <code>null</code>.
	 */
	public ReaderInputStream(Reader reader) {
		input = reader;
	}

	/**
	 * Construct a <CODE>ReaderInputStream</CODE> for the specified
	 * <CODE>Reader</CODE>, with the specified encoding.
	 * 
	 * @param reader
	 *            non-null <CODE>Reader</CODE>.
	 * @param encoding
	 *            non-null <CODE>String</CODE> encoding.
	 * @throws IllegalArgumentException if <code>encoding</code> is null
	 */
	public ReaderInputStream(Reader reader, String encoding) {
		this(reader);
		if (encoding == null) {
			throw new IllegalArgumentException("encoding must not be null");
		}
		this.encoding = encoding;
	}

	/**
	 * Reads the next encoded byte.
	 *
	 * @return the byte as an unsigned value in 0..255, or -1 at the end of
	 *         the stream
	 * @throws IOException if the stream has been closed or the reader fails
	 */
	@Override
	public synchronized int read() throws IOException {
		if (input == null) {
			throw new IOException("Stream Closed");
		}

		byte result;
		if (slack != null && begin < slack.length) {
			result = slack[begin];
			if (++begin == slack.length) {
				slack = null;
			}
		} else {
			byte[] buf = new byte[1];
			if (read(buf, 0, 1) <= 0) {
				// end of stream: report it instead of falling through to buf[0]
				return -1;
			}
			result = buf[0];
		}

		// mask so that bytes >= 0x80 are not confused with the -1 EOF marker
		return result & 0xFF;
	}

	/**
	 * Reads from the <code>Reader</code> into a byte array.
	 *
	 * @param b the byte array to read into
	 * @param off the offset in the byte array
	 * @param len the length in the byte array to fill
	 * @return the actual number read into the byte array, 0 when
	 *         <code>len</code> is 0, -1 at the end of the stream
	 * @throws IOException if the stream has been closed or the reader fails
	 */
	@Override
	public synchronized int read(byte[] b, int off, int len) throws IOException {
		if (input == null) {
			throw new IOException("Stream Closed");
		}
		if (len == 0) {
			// a zero-length char buffer would make the refill loop spin forever
			return 0;
		}

		while (slack == null) {
			char[] buf = new char[len]; // might read too much
			int n = input.read(buf);
			if (n == -1) {
				return -1;
			}
			if (n > 0) {
				slack = new String(buf, 0, n).getBytes(encoding);
				begin = 0;
			}
		}

		if (len > slack.length - begin) {
			len = slack.length - begin;
		}

		System.arraycopy(slack, begin, b, off, len);

		if ((begin += len) >= slack.length) {
			slack = null;
		}

		return len;
	}

	/**
	 * Forwards the mark request to the wrapped reader.
	 * 
	 * @param limit
	 *            the maximum limit of bytes that can be read before the mark
	 *            position becomes invalid
	 * @throws RuntimeException wrapping the reader's IOException, if any
	 */
	@Override
	public synchronized void mark(final int limit) {
		try {
			input.mark(limit);
		} catch (IOException ioe) {
			// keep the original exception as cause for diagnosis
			throw new RuntimeException(ioe.getMessage(), ioe);
		}
	}

	/**
	 * Available.
	 *
	 * @return the number of pending encoded bytes, otherwise 1 when the
	 *         reader reports it is ready and 0 when it does not
	 * @throws IOException if the stream has been closed or the reader fails
	 */
	@Override
	public synchronized int available() throws IOException {
		if (input == null) {
			throw new IOException("Stream Closed");
		}
		if (slack != null) {
			return slack.length - begin;
		}
		return input.ready() ? 1 : 0;
	}

	/**
	 * Mark supported.
	 *
	 * @return false - mark is not supported
	 */
	@Override
	public boolean markSupported() {
		return false; // would be imprecise
	}

	/**
	 * Discards pending bytes and resets the wrapped reader.
	 *
	 * @throws IOException if the stream has been closed or the reader fails
	 */
	@Override
	public synchronized void reset() throws IOException {
		if (input == null) {
			throw new IOException("Stream Closed");
		}
		slack = null;
		input.reset();
	}

	/**
	 * Closes the wrapped reader; later calls are no-ops.
	 *
	 * @throws IOException if closing the reader fails
	 */
	@Override
	public synchronized void close() throws IOException {
		if (input != null) {
			input.close();
			slack = null;
			input = null;
		}
	}
}
tmp/org.txm.core/src/groovy/filters/ProcessEnclitics/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to process enclitics in the BFM tokenizer</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/ProcessEnclitics/ProcessEnclitics.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.ProcessEnclitics;
29

  
30
import org.txm.importer.filters.*;
31
import java.util.regex.*;
32
import org.txm.tokenizer.TokenizerClasses;
33

  
34
// TODO: Auto-generated Javadoc
35
/**
 * Filter that splits a <code>&lt;w&gt;</code> element ending with a dash
 * followed by an enclitic into two elements, and counts how many lines
 * were split.
 */
class ProcessEnclitics extends Filter {

	/** Number of lines on which an enclitic was split off. */
	int counter;

	/** Regex alternatives for the enclitics, interpolated into the line pattern. */
	String enclitics = TokenizerClasses.enclitics;

	/**
	 * Reads the "enclitics" entry from the given parameter object.
	 * Any failure is reported on stderr and otherwise ignored.
	 *
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args)
	{
		try
		{
			enclitics = args.get("enclitics");
		}
		catch(Exception e)
		{
			System.err.println(e);
			System.err.println("Processenclitics needs 1 Map with arg  :\n enclitics")
		}
	}

	/**
	 * Resets the counter and announces the start of processing.
	 *
	 * NOTE(review): there is no explicit return; Groovy returns the value of
	 * the last statement coerced to boolean. Confirm what Filter expects.
	 *
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {
		counter = 0;
		System.out.println("begin enclitics");
	}

	/**
	 * Prints the number of split enclitics.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after()
	{
		print "$counter enclitics with dashes found\n";
	}

	/**
	 * Writes the current line unchanged, or as two word elements when it is
	 * a single word element ending with "-" plus an enclitic.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter()
	{
		def matcher = (line =~ /\A\s*(<w[^>]*>)(.*)-($enclitics)<\/w>\Z/);
		if (!matcher)
		{
			// not an enclitic form: pass the line through
			output.write(line+lineSeparator);
			return;
		}

		counter++;
		def groups = matcher[0];
		def openTag = groups[1];
		def stem = groups[2];
		def clitic = groups[3];
		output.write("${openTag}${stem}-</w>\n<w>${clitic}</w>"+lineSeparator);
	}
}
tmp/org.txm.core/src/groovy/filters/FusionHeader/FusionHeader.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.FusionHeader;
29

  
30
import java.io.File;
31

  
32
import java.io.BufferedReader;
33
import java.io.FileInputStream;
34
import java.io.InputStreamReader;
35
import org.txm.importer.filters.*;
36

  
37
// TODO: Auto-generated Javadoc
38
/**
 * Filter that copies the content of a header file to the output before the
 * first input line, then passes every input line through unchanged.
 */
class FusionHeader extends Filter {

	/** Path of the header file copied in front of the first line. */
	String xmlHeaderFile = "";

	/** True until the header has been copied successfully. */
	boolean firstLoop = true;

	/** Reader on the header file; assigned in {@link #filter()}. */
	BufferedReader HEADER;

	/** Thread received as second parameter; only checked for null in this class. */
	Thread cutHeaderfilter;

	/**
	 * Expects two entries: the header file path and a Thread.
	 * Any other size prints a usage message and exits the JVM.
	 *
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args) {
		if (args.size() == 2) {
			xmlHeaderFile = args[0].toString();
			cutHeaderfilter = (Thread) args[1];
		} else {
			System.err
					.println("CutHeader needs 2 args\nString:xmlHEADERFilePath\nThread:cutHeaderFilter");
			System.exit(-1);
		}
	}

	/**
	 * Closes the header reader. When the header was never opened, or closing
	 * fails, the same diagnostic is printed on stderr.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after() {
		if (HEADER == null) {
			// explicit check instead of relying on a caught NullPointerException
			System.err
					.println("Can't close Header file, one cause might be that the header file is not correctly built");
			return;
		}
		try {
			HEADER.close();
			HEADER= null;
		} catch (Exception e) {
			System.err
					.println("Can't close Header file, one cause might be that the header file is not correctly built");
		}
	}

	/**
	 * Warns on stderr when a parameter is missing.
	 *
	 * @return always true
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {
		if (xmlHeaderFile == "" || cutHeaderfilter == null) {
			System.err
					.println("CutHeader needs 2 args\nString:xmlHEADERFilePath\nThread:cutHeaderFilter");
		}
		return true;
	}

	/**
	 * On the first call, copies the header file line by line to the output;
	 * on every call, writes the current line.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	protected void filter()
	{
		if (firstLoop) {
			File inputFile = new File(xmlHeaderFile)
			if ( !inputFile.exists()) {
				System.err.println("ERROR : DOES NOT EXISTS "+xmlHeaderFile);
			}
			HEADER = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), encodage));
			try {
				// polls until the reader reports data is available
				while (!HEADER.ready()) {
					Thread.sleep(10);
				}
				String templine;
				while ((templine = this.HEADER.readLine()) != null) {
					output.print(templine+lineSeparator);
				}
				firstLoop=false;
			} finally {
				// release the file handle even when copying fails
				HEADER.close();
			}
		}
		output.print(line+lineSeparator);
	}
}
tmp/org.txm.core/src/groovy/filters/FusionHeader/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to restore the XML-TEI header of XML-TEI files</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/OneOpenTagPerLine/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to prepare the BFM tokenizer files</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/OneOpenTagPerLine/OneOpenTagPerLine.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.OneOpenTagPerLine;
29

  
30
//Pre-processing extra-word tags (1)
31
import org.txm.importer.filters.Filter;
32
import org.txm.tokenizer.TokenizerClasses;
33

  
34
// TODO: Auto-generated Javadoc
35
/**
 * Filter that starts a new output line for every input line whose trimmed
 * content begins with "&lt;" and appends every other line, trimmed and
 * preceded by a space, to the current output line.
 */
class OneOpenTagPerLine extends Filter {

	/** Tag pattern taken from TokenizerClasses; not used by this class itself. */
	String tag_all = TokenizerClasses.tag_all;

	/** Counter printed in {@link #after()}; never incremented in this class. */
	int counterreg1 = 0;

	/** Counter printed in {@link #after()}; never incremented in this class. */
	int counterreg2 = 0;

	/**
	 * No parameter is used.
	 *
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args) {

	}

	/**
	 * Nothing to prepare.
	 *
	 * NOTE(review): the body is empty, so Groovy supplies the return value.
	 * Confirm what Filter expects.
	 *
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {

	}

	/**
	 * Prints both counters.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after()
	{
		println "reg1 : $counterreg1";
		println "reg2 : $counterreg2";
	}

	/** Trimmed content of the line currently being filtered. */
	def segment;

	/**
	 * Writes the current line on a new output line when it starts with a
	 * tag, otherwise appends its trimmed content after a space.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter()
	{
		segment = line.trim()
		// startsWith is safe on blank lines, where indexing segment[0] would throw
		if (segment.startsWith("<")) {
			output.write(lineSeparator+line);
		} else {
			output.write(" "+segment)
		}
	}
}
tmp/org.txm.core/src/groovy/filters/ProcessQuotes/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to process quotes in the BFM tokenizer</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/ProcessQuotes/ProcessQuotes.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.ProcessQuotes;
29

  
30
//Pre-processing extra-word tags (1)
31
import org.txm.importer.filters.*;
32
import java.util.regex.*;
33

  
34
// TODO: Auto-generated Javadoc
35
/**
 * Filter that replaces the entities <code>&amp;quot;</code>,
 * <code>&amp;ldquo;</code> and <code>&amp;rdquo;</code> by a plain double
 * quote and counts the lines that were modified.
 */
class ProcessQuotes extends Filter {

	/** Number of lines in which at least one entity was replaced. */
	int counter;

	/** The line as it was before replacement. */
	def old;

	/**
	 * No parameter is used.
	 *
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args) {

	}

	/**
	 * Resets the counter.
	 *
	 * NOTE(review): there is no explicit return; Groovy returns the value of
	 * the assignment coerced to boolean. Confirm what Filter expects.
	 *
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {
		counter = 0;
	}

	/**
	 * Prints the counter.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after()
	{
		println "Replaced $counter quotes\n";
	}

	/**
	 * Replaces the quote entities of the current line, bumps the counter
	 * when the line changed, and writes the result.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter()
	{
		old = line;

		def entityMatcher = (line =~ /&(quot|ldquo|rdquo);/);
		line = entityMatcher.replaceAll("\"");

		if (line != old) {
			counter++;
		}

		output.write(line+lineSeparator);
	}
}
tmp/org.txm.core/src/groovy/filters/Concatenator/Concatenator.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

  
22
//
23
// This file is part of the TXM platform.
24
//
25
// The TXM platform is free software: you can redistribute it and/or modify
26
// it under the terms of the GNU General Public License as published by
27
// the Free Software Foundation, either version 3 of the License, or
28
// (at your option) any later version.
29
//
30
// The TXM platform is distributed in the hope that it will be useful,
31
// but WITHOUT ANY WARRANTY; without even the implied warranty of
32
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33
// GNU General Public License for more details.
34
//
35
// You should have received a copy of the GNU General Public License
36
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
37
// 
38
// 
39
// 
40
// $LastChangedDate:$
41
// $LastChangedRevision:$
42
// $LastChangedBy:$ 
43
//
44
package filters.Concatenator;
45

  
46
import java.io.BufferedWriter;
47
import java.io.File;
48
import java.io.FileOutputStream;
49
import java.io.OutputStreamWriter;
50
import java.io.PrintStream;
51

  
52
import org.txm.importer.filters.*;
53

  
54
// TODO: Auto-generated Javadoc
55
/**
 * Filter that copies every line it receives, unchanged, into a destination
 * file encoded as UTF-8.
 *
 * The destination path is supplied through {@link #SetUsedParam(Object)} and
 * the file is opened in {@link #before()}.
 */
class Concatenator extends Filter {

	/** Path of the file the lines are written to; null until SetUsedParam sets it. */
	String destination;

	/**
	 * Opens the destination file for writing. The file is opened without
	 * append mode, so any existing content is replaced.
	 *
	 * @return false when no destination has been defined, true once the
	 *         writer has been opened
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {
		// Groovy truth rejects both null (SetUsedParam never called) and "".
		// Comparing against "" alone let a null path fall through to
		// new File(...), which failed before the message could be printed.
		if (!destination) {
			println("You need to define destination file before launching this filter");
			return false;
		}
		def f = new File(destination);
		this.output = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");
		return true;
	}

	/**
	 * Reads the destination path from the filter arguments.
	 *
	 * @param args indexable collection expected to hold exactly one element,
	 *             the destination path; any other size only prints a usage
	 *             message and leaves the destination untouched
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args) {
		if (args.size() == 1) {
			destination = args[0];
		} else {
			System.out
					.println("Concatenator need 1 args : \nString:destinatione");
		}
	}

	/**
	 * Reports the number of lines read and closes the destination writer.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after() {
		// linecounter is not declared here; presumably maintained by Filter.
		System.out.println("Concatenator readed lines : " + linecounter);
		this.output.close();
	}

	/**
	 * Writes the current line followed by the line separator, then flushes
	 * so the text reaches the file immediately.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter() {
		output.write(line + lineSeparator);
		output.flush();
	}
}
tmp/org.txm.core/src/groovy/filters/Concatenator/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to concat 2 files</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/RegexFilter/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter to select lines with a regular expression</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/RegexFilter/RegexFilter.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.RegexFilter;
29

  
30
import org.txm.importer.filters.*;
31
import java.util.regex.Matcher;
32
import java.util.regex.Pattern;
33

  
34
import org.txm.importer.filters.Filter;
35

  
36
// TODO: Auto-generated Javadoc
37
/**
 * Filter that keeps only the lines matching a regular expression and drops
 * every other line.
 *
 * @author jmague
 */
public class RegexFilter extends Filter {
	/** Compiled expression; stays null until SetUsedParam receives a pattern. */
	private Pattern pattern;

	/**
	 * Compiles the regular expression given as the single filter argument.
	 *
	 * @param args indexable collection expected to hold exactly one element,
	 *             the regular expression; any other size only prints a usage
	 *             message
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args) {
		if (args.size() != 1) {
			System.out.println("Regexfilter needs 1 args :\nString:patternRegex");
			return;
		}
		pattern = Pattern.compile(args[0]);
	}

	/**
	 * Nothing to clean up for this filter.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after() {
	}

	/**
	 * Checks that a pattern has been configured.
	 *
	 * @return true when a pattern is available, false (after printing a
	 *         usage message) otherwise
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before() {
		if (pattern != null) {
			return true;
		}
		System.out.println("Regex filter needs 1 args :\nString:patternRegex");
		return false;
	}

	/**
	 * Writes the current line to the output only when the whole line matches
	 * the pattern.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter() {
		// matches() requires the entire line to match, not just a part of it.
		if (pattern.matcher(line).matches()) {
			output.write(line + lineSeparator);
		}
	}
}
tmp/org.txm.core/src/groovy/filters/WordInternalElement/WordInternalElement.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.WordInternalElement;
29

  
30
import org.txm.importer.filters.*;
31
import java.util.regex.*;
32
import org.txm.tokenizer.*;
33

  
34
// TODO: Auto-generated Javadoc
35
/**
 * Filter that wraps elements flagged as "word_part" in a
 * {@code <seg type="word_part">} element, trimming the whitespace around
 * their text content, and counts how many lines were rewritten.
 */
class WordInternalElement extends Filter {

	/** Number of lines modified by the rewrite since before() was called. */
	int counter;

	/** Copy of the current line taken before the rewrite, used to detect a change. */
	def old;

	/**
	 * Tag names the rewrite applies to, inserted as-is into the regular
	 * expression (presumably a '|' separated alternation; confirm against
	 * TokenizerClasses).
	 */
	String corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;

	/**
	 * Reads an optional override of the tag list from the filter arguments.
	 *
	 * @param args expected to be a Map; the value stored under the key
	 *             "corr_tags_no_seg", when present, replaces the default
	 *             tag list. Any failure is reported on stderr.
	 * @see org.txm.importer.filters.Filter#SetUsedParam(java.lang.Object)
	 */
	void SetUsedParam(Object args)
	{
		try
		{
			def value = args.get("corr_tags_no_seg");
			// A map without the key must not wipe the default: a null tag
			// list would end up as the text "null" inside the pattern.
			if (value != null)
				corr_tags_no_seg = value;
		}
		catch(Exception e)
		{
			System.err.println(e);
			System.err.println("wordinternal needs 1 Map with arg  :\n corr_tags_no_seg")
		}
	}

	/**
	 * Resets the change counter before the first line is processed.
	 *
	 * @return always true; the method previously had no return statement,
	 *         so Groovy coerced the null result of println to false
	 * @see org.txm.importer.filters.Filter#before()
	 */
	boolean before()
	{
		counter = 0;
		println "begin wordinternal \n";
		return true;
	}

	/**
	 * Prints how many lines were rewritten.
	 *
	 * @see org.txm.importer.filters.Filter#after()
	 */
	void after()
	{
		println "Deleted $counter wordinternalspaces \n";
	}

	/**
	 * Rewrites the current line and writes it to the output.
	 *
	 * Every element whose name is one of the configured tags and whose
	 * opening tag contains "word_part" is wrapped in a
	 * {@code <seg type="word_part">} element; whitespace between the tags
	 * and the text content is removed.
	 *
	 * @see org.txm.importer.filters.Filter#filter()
	 */
	void filter()
	{
		old = line;
		// ${corr_tags_no_seg} interpolates the configured tag names; the
		// pattern previously contained the literal text "corr_tags_no_seg"
		// and therefore never used the field. Group 2 captures the tag name
		// so that \2 forces the matching closing tag.
		line = (line =~ /(<(${corr_tags_no_seg}) [^>]*word_part[^>]*>)\s*([^<]*?)\s*(<\/\2>)/).replaceAll("<seg type=\"word_part\">\$1\$3\$4<\\/seg>");

		// Count modified lines, not individual replacements.
		if (old != line)
			counter++;

		output.write(line+lineSeparator);
	}
}
tmp/org.txm.core/src/groovy/filters/WordInternalElement/package.html (revision 187)
1
<html>
2
<body>
3
<p>Deprecated package: Filter of the BFM tokenizer</p>
4
</body>
5
</html>
0 6

  
tmp/org.txm.core/src/groovy/filters/TagSentences/TagSentences.groovy (revision 187)
1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package filters.TagSentences;
29

  
30
import java.util.regex.*
31

  
32
import org.txm.importer.filters.*
33
import org.txm.tokenizer.*
34

  
35
import filters.CutHeader.*
36
import filters.FusionHeader.*
37

  
38
// TODO: Auto-generated Javadoc
39
/**
40
 * The Class TagSentences.
41
 */
42
class TagSentences extends Filter {
43
	
44
	/** The counter. */
45
	int counter;
46
	
47
	/** The m. */
48
	def m;
49
	
50
	/** The segment. */
51
	def segment;
52
	
53
	/** The linetype. */
54
	def linetype = "out";
55
	
56
	/** The open_corr_tags. */
57
	def open_corr_tags = new LinkedList<String>();
58
	
59
	/** The LAS topen_corr_tags. */
60
	def LASTopen_corr_tags = "";
61
	
62
	/** The open_div_tags. */
63
	def open_div_tags = new LinkedList<String>();
64
	
65
	/** The LAST open_div_tags. */
66
	def LASTopen_div_tags = "";
67
	
68
	/** The corr. */
69
	def corr = "no";
70
	
71
	/** The corr_name. */
72
	def corr_name;
73
	
74
	/** The corr_tag. */
75
	def corr_tag;
76
	
77
	/** The pending. */
78
	def pending = "no";
79
	
80
	/** The scounter. */
81
	def scounter = 0;
82
	
83
	/** The _before. */
84
	def _before;
85
	
86
	/** The _after. */
87
	def _after;
88
	
89
	/** The div_tags. */
90
	def div_tags = TokenizerClasses.div_tags;
91
	
92
	/** The q_tags. */
93
	def q_tags = TokenizerClasses.q_tags;
94
	
95
	/** The corr_tags_no_seg. */
96
	def corr_tags_no_seg = TokenizerClasses.corr_tags_no_seg;
97
	
98
	/** The corr_tags. */
99
	def corr_tags = TokenizerClasses.corr_tags;
100
	
101
	def strongPunct = TokenizerClasses.punct_strong;
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff