/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 881

     <?xml version="1.0" encoding="UTF-8"?>
     <classpath>
     	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins">
     		<accessrules>
     			<accessrule kind="accessible" pattern="**"/>
     		</accessrules>
     	</classpathentry>
     	<classpathentry kind="src" path="src"/>
     	<classpathentry kind="output" path="bin"/>
     </classpath>

     Manifest-Version: 1.0
     Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp
      ort,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org
      .eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,or
      g.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpo
      rt,org.txm.statsengine.r.core;visibility:=reexport,org.txm.lexicaltab
      le.core;bundle-version="1.0.0";visibility:=reexport,org.txm.statsengi
      ne.core;bundle-version="1.0.0";visibility:=reexport,org.txm.core;bund
      le-version="0.7.0";visibility:=reexport,org.txm.progression.core;bund
      le-version="1.0.0";visibility:=reexport,org.txm.chartsengine.core;bun
      dle-version="1.0.0";visibility:=reexport,org.txm.chartsengine.jfreech
      art.core;bundle-version="1.0.0";visibility:=reexport,org.txm.chartsen
      gine.r.core;bundle-version="1.0.0";visibility:=reexport
     Require-Bundle: org.txm.lexicaltable.core;bundle-version="1.0.0";visibility:=reexport,
      org.txm.progression.core;bundle-version="1.0.0";visibility:=reexport
     Export-Package: org.txm.functions.contrasts,
      org.txm.specificities.core.chartsengine.jfreechart,
      org.txm.specificities.core.chartsengine.r,

        </license>
        <requires>
           <import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/>
           <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
           <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.core"/>
           <import plugin="org.txm.lexicaltable.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.txm.progression.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/>
           <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.rcp"/>
           <import plugin="org.txm.chartsengine.rcp"/>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/>
           <import plugin="org.txm.lexicaltable.rcp"/>
        </requires>

tmp/org.txm.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 881)
1	1	eclipse.preferences.version=1
2		groovy.compiler.level=23
	2	groovy.compiler.level=-1
3	3	groovy.script.filters=scripts/*/.groovy,y,src/main/resources/*/.groovy,y,src/test/resources/*/.groovy,y

     package org.txm.importer
     import javax.xml.stream.XMLStreamException
     import org.apache.tools.ant.types.resources.selectors.InstanceOf;
     import groovy.xml.*
     // Working directories of the "corpus Matrice" conversion; all are sub-directories of root.
     def root = new File("/home/mdecorde/xml/temoignagesnice/corpus Matrice - fichiers xmlisés/")
     File srcdir = new File(root, "orig")
     File outdir = new File(root, "tmp")
     File okdir = new File(root, "ok")
     File ok2dir = new File(root, "ok2")
     File temoignagedir = new File(root, "temoignages")
     // start from an empty "ok2" directory
     ok2dir.deleteDir()
     ok2dir.mkdir()
     // Rename each chapter <title> to <head> in the XML files of "tmp" and write the result to "ok2"
     for (File xmlFile : outdir.listFiles()) {
     	if (xmlFile.getName().endsWith(".xml")) {
     		def document = new XmlParser().parse(xmlFile)
     		for (def titleNode : document.body.chapter.title) {
     			println titleNode
     			titleNode.name = "head"
     		}
     		File target = new File(ok2dir, xmlFile.getName())
     		target.withWriter("UTF-8") { out ->
     			new XmlNodePrinter(new PrintWriter(out)).print(document)
     		}
     	}
     }
     /*
     // ADD chapter@title
     for (def file : outdir.listFiles()) {
     	if (!file.getName().endsWith(".xml")) continue;
     	def doc = new XmlParser().parse(file);
     	for (def chapter : doc.body.chapter) {
     		for (def title : chapter.title) {
     			chapter.@title = title.text()
     			break;
+    		}
+    	}
     	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
     		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
+    	}
+    }
     */
     //FIX figure and caption inclusions
     /*
     for (def file : outdir.listFiles()) {
     	if (!file.getName().endsWith(".xml")) continue;
     	def doc = new XmlParser().parse(file);
     	for (def note : doc.body."**".figure) {
     		note.name = "note"
     		//println "fig : $note"
     		//if ("Image :" == note.text()) {
     			def children = note.parent().children()
     			int i = children.indexOf(note)
     			//println i + " < "+children.size()
     			def nextChild = children[i+1]
     			if (nextChild != null && nextChild.name().toString() == "caption") {
     				println nextChild
     				note.value = "Images : "+note.text()// + " "+nextChild.text()
     				//println note
     				children.remove(i+1)
     				note.append(nextChild)
+    			}
     		//}
+    	}
     	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
     		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
+    	}
+    }
     */
     /*
     // DOCBOOK -> DOCBOOK TEXT ONLY
     for (def file : outdir.listFiles()) {
     	def doc = new XmlParser().parse(file);
     	def body = null
     	def bookinfo = null
     	def preface = null
     	for (def e : doc.body) body = e
     	for (def e : doc.bookinfo) {
     		doc.remove(e)
+    	}
     	for (def e : doc.preface) {
     		doc.remove(e)
+    	}
     	for (def e : doc.appendix) {
     		doc.remove(e)
+    	}
     	for (def e : doc.chapter) {
     		doc.remove(e)
+    	}
     	if (body == null) {
     		println "error text: "+file
     		continue
+    	}
     	new File(ok2dir, file.getName()).withWriter("UTF-8") { writer ->
     		new XmlNodePrinter(new PrintWriter(writer)).print(doc)
+    	}
     	//		writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
     	//			mkp.yield body
     	//		  })
+    }
     */
     //DOCBOOK to TEI
     /*
      for (def file : outdir.listFiles()) {
      def doc = new XmlParser().parse(file);
      def body = null
      //def bookinfo = null
      for (def e : doc.body) body = e
      //for (def e : doc.bookinfo) bookinfo = e
      //println body.getClass()
      if (body == null) {
      println "error text: "+file
      continue
+     }
      //	bookinfo.name = "teiHeader"
      body.name = "text"
      def teins = new groovy.xml.Namespace("http://www.tei-c.org/ns/1.0",'tei')
      //	for (def node : body."**") {
      //		if (node instanceof String) continue
      //		def name = node.name()
      //		if (name instanceof String)
      //			node.name = teins.get(name)
      //		else
      //			node.name = teins.get(name.getLocalPart())
      //	}
      for (def figure : body."**".figure) {
      figure.name = "note"
      figure.value = "Image : " + figure.caption.text()
+     }
      for (def chapter : body."**".chapter) {
      chapter.name = "div"
      chapter.@type = "chapter"
+     }
      for (def caption : body."**".title) {
      caption.name = "head"
+     }
      for (def para : body."**".para) {
      para.name = "p"
+     }
      def newdoc = new Node(null, "TEI");
      newdoc.@xmlns="http://www.tei-c.org/ns/1.0";
      newdoc.append(new Node(null, "teiHeader"))
      newdoc.append(body)
      new File(okdir, file.getName()).withWriter("UTF-8") { writer ->
      new XmlNodePrinter(new PrintWriter(writer)).print(newdoc)
+     }
      //		writer.print XmlUtil.serialize(new StreamingMarkupBuilder().bind {
      //			mkp.yield body
      //		  })
+     }
      */
     // remove TEI
     /*outdir.deleteDir()
      outdir.mkdir()
      def errors = []
      for (def file : srcdir.listFiles()) {
      if (file.isDirectory()) continue;
      //new EncodingConverter(file, "Windows-1252", "UTF-8")
      File outfile = new File(outdir, file.getName());
      outfile.withWriter("UTF-8") { writer ->
      file.eachLine("UTF-8") { line ->
      if (line.trim() == "<TEI>") {
      } else if (line.trim() == "</TEI>") {
      writer.println("</book>")
      } else if (line.trim() == "<book lang=\"fr\"/>") {
      writer.println("<book lang=\"fr\">")
      } else {
      writer.println(line)
+     }
+     }
+     }
      try {
      ValidateXml.testAndThrow(outfile);
      } catch (XMLStreamException e) {
      println file.getName() + " : "+ e.getMessage()
      errors << file
      if (e.getMessage().contains('Message: The element type "TEI" must be terminated by the matching end-tag "</TEI>"')) {
      println "Delete line : "+e.location.lineNumber
+     }
      println ""
+     }
+     }
      */
     println "done"
     //if (errors.size() > 0)
     //	println ""+errors.size()+" errors : $errors"
     //String content = file.getText("Windows-1252")
     //println content

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate: 2011-10-19 17:50:26 +0200 (mer., 19 oct. 2011) $
     // $LastChangedRevision: 2038 $
     // $LastChangedBy: alavrentev $
     //
     package org.txm.importer
     import javax.xml.parsers.DocumentBuilder;
     import javax.xml.parsers.DocumentBuilderFactory;
     import javax.xml.parsers.ParserConfigurationException;
     import javax.xml.transform.OutputKeys;
     import javax.xml.transform.Result;
     import javax.xml.transform.Source;
     import javax.xml.transform.Transformer;
     import javax.xml.transform.TransformerFactory;
     import javax.xml.transform.dom.DOMSource;
     import javax.xml.transform.stream.StreamResult;
     import org.w3c.dom.Document;
     import org.w3c.dom.Element;
     import org.w3c.dom.NodeList;
     import org.xml.sax.SAXException;
     import javax.xml.stream.*;
     import java.io.File;
     import java.net.URL;
     // TODO: Auto-generated Javadoc
     /**
      * Extract w tags from a tei file
      * not finished.
+     *
      * @author mdecorde
      */
     class WExtractWithMode
+    {
     	/**
     	 * Copies a TEI file to outfile, keeping only the w elements whose rank falls
     	 * in the ranges selected by modemax. Every other element is always copied.
     	 *
     	 * modemax has the form "mode/max", where max is the number of words asked for
     	 * and mode selects the ranges:
     	 * 1a = one range at the beginning, 1m = one range in the middle,
     	 * 1z = one range at the end, 2 = beginning and end (max/2 each),
     	 * 3 = beginning, middle and end (max/3 each).
     	 *
     	 * Nothing is written when modemax is malformed, when the mode is unknown or
     	 * when the file contains fewer than max words.
     	 *
     	 * @param infile the TEI file to read
     	 * @param outfile the file to write
     	 * @param modemax the extraction mode and the word count, e.g. "3/45000"
     	 * @return always null (the method is declared without a return type)
     	 */
     	public process(File infile, File outfile, String modemax)
     	{
     		println "Process "+infile.getName()+", keep $modemax words"
     		int count = this.countW(infile);
     		// a malformed modemax is reported instead of being silently turned into max = 0
     		String[] spec = (modemax == null) ? new String[0] : modemax.split("/")
     		int max = 0
     		String mode = ""
     		try {
     			mode = spec[0]
     			max = Integer.parseInt(spec[1])
     		} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
     			println "invalid mode/max value '$modemax', expected <mode>/<max> : "+e
     			return;
     		}
     		if(count < max)
     		{
     			println "can't extract $max words, the file "+infile.getName()+" contains only $count words"
     			return;
     		}
     		int part = 0;
     		switch (mode)
     		{
     			case "3":
     				part = max/3
     				break
     			case "2":
     				part = max/2
     				break
     			case "1a":
     			case "1m":
     			case "1z":
     				part = max
     				break
     			default:
     				println "mode must be 1a, 1m, 1z, 2 or 3"
     				return
     		}
     		// Each range is [from, to] (both inclusive). A range left at [0, 0] selects no word
     		// because the counter is incremented before a w is tested (first w has rank 1).
     		int from1 = 0
     		int to1 = 0
     		if (mode != "1m" && mode != "1z")
     		{
     			to1 = part
     		}
     		int from2 = 0
     		int to2 = 0
     		if (mode == "3" || mode == "1m")
     		{
     			from2 = (count/2) - (part/2);
     			to2 =(count/2) + (part/2);
     		}
     		int from3 = 0
     		int to3 = 0
     		if (mode != "1a" && mode != "1m")
     		{
     			from3 = count -part;
     			to3= count-1;
     		}
     		def inRange = { int rank ->
     			(rank >= from1 && rank <= to1) || (rank >= from2 && rank <= to2) || (rank >= from3 && rank <= to3)
     		}
     		println " count : "+count
     		println "  get from "+from1+" to "+to1
     		println "  get from "+from2+" to "+to2
     		println "  get from "+from3+" to "+to3
     		// withInputStream/withOutputStream close both streams even when parsing or writing fails
     		infile.withInputStream { InputStream inputData ->
     			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
     			outfile.withOutputStream { OutputStream output ->
     				XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8");
     				boolean isW = false; // set on the first w and never reset: later text is filtered by printW
     				boolean isText = false; // true once the text element has been opened
     				boolean printW = true;
     				int wcount = 0;
     				String localname;
     				String prefix;
     				writer.writeStartDocument("utf-8", "1.0");
     				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
     				{
     					// before the text element everything is kept
     					printW = !isText || inRange(wcount)
     					switch (event)
     					{
     						case XMLStreamConstants.START_ELEMENT:
     							localname = parser.getLocalName();
     							prefix = parser.getPrefix();
     							if (localname == "text")
     								isText = true;
     							if (localname == "w")
     							{
     								isW = true;
     								wcount++;
     								printW = !isText || inRange(wcount)
     								if (printW)
     								{
     									writeStartTag(writer, prefix, localname)
     									copyAttributes(parser, writer)
     								}
     							}
     							else
     							{
     								writeStartTag(writer, prefix, localname)
     								if (localname == "teiHeader")
     								{
     									writer.writeAttribute("xmlns:me", "http://www.menota.org/ns/1.0");
     									writer.writeAttribute("xmlns:bfm", "http://bfm.ens-lsh.fr/ns/1.0");
     								}
     								if (localname == "TEI")
     								{
     									writer.writeAttribute("xmlns","http://www.tei-c.org/ns/1.0");
     								}
     								copyAttributes(parser, writer)
     							}
     							break;
     						case XMLStreamConstants.END_ELEMENT:
     							localname = parser.getLocalName()
     							if (localname != "w" || printW)
     							{
     								writer.writeEndElement();
     								writer.writeCharacters("\n");
     							}
     							break;
     						case XMLStreamConstants.CHARACTERS:
     							if (!isW || printW)
     							{
     								writer.writeCharacters(parser.getText().trim());
     							}
     							break;
     					}
     				}
     				writer.flush();
     				writer.close();
     			}
     		}
     		return null
     	}
     	/**
     	 * Writes a start tag, using "prefix:localname" when the element has a prefix.
     	 */
     	private static void writeStartTag(XMLStreamWriter writer, String prefix, String localname)
     	{
     		if (prefix != null && prefix.length() > 0)
     			writer.writeStartElement(prefix+":"+localname);
     		else
     			writer.writeStartElement(localname);
     	}
     	/**
     	 * Copies the attributes of the current start element of parser to writer.
     	 * A null attribute prefix is handled like an empty one.
     	 */
     	private static void copyAttributes(XMLStreamReader parser, XMLStreamWriter writer)
     	{
     		for (int i = 0 ; i < parser.getAttributeCount() ; i++)
     		{
     			String attPrefix = parser.getAttributePrefix(i)
     			String attName = parser.getAttributeLocalName(i)
     			if (attPrefix != null && attPrefix.length() > 0)
     				attName = attPrefix+":"+attName
     			writer.writeAttribute(attName, parser.getAttributeValue(i));
     		}
     	}
     	/**
     	 * Counts the w elements of an XML file.
     	 * The input stream and the StAX reader are released even when parsing fails.
     	 *
     	 * @param infile the XML file to read
     	 * @return the number of start tags whose local name is "w"
     	 */
     	public int countW(File infile)
     	{
     		int count = 0;
     		infile.withInputStream { InputStream inputData ->
     			XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);
     			try
     			{
     				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
     				{
     					if (event == XMLStreamConstants.START_ELEMENT && parser.getLocalName() == "w")
     						count++;
     				}
     			}
     			finally
     			{
     				parser.close()
     			}
     		}
     		return count;
     	}
     	/**
     	 * Runs the extraction on the files of ~/xml/extract that are listed in
     	 * ~/xml/extract/maxfilemode; the results are written to ~/xml/extract/results.
     	 *
     	 * maxfilemode contains one tab-separated line per file: name, mode, word count.
     	 * Lines that do not have exactly three columns are ignored.
     	 *
     	 * @param args the arguments (not used)
     	 */
     	public static void main(String[] args)
     	{
     		String userDir = System.getProperty("user.home");
     		File directory = new File(userDir+"/xml/extract/");
     		File outdir = new File(userDir+"/xml/extract/","results");
     		outdir.mkdir();
     		File maxfilemode = new File(userDir+"/xml/extract/maxfilemode");
     		/*
     		 * maxfilemode format:
     		 *
     		 * filename1.xml	3	45000
     		 * filename2.xml	1a	15000
     		 * filename3.xml	1m	15000
     		 * filename4.xml	1z	15000
     		 * filename5.xml	2	22500
     		 */
     		if (!maxfilemode.isFile())
     		{
     			println "missing parameter file : "+maxfilemode
     			return
     		}
     		// file name -> "mode/max", the format expected by process()
     		Map<String, String> maxperfile = new HashMap<String, String>();
     		maxfilemode.eachLine { String line ->
     			String[] split = line.split("\t");
     			if (split.length == 3)
     			{
     				maxperfile.put(split[0], split[1]+"/"+split[2]);
     			}
     		}
     		println maxperfile;
     		def files = directory.listFiles();
     		for (File infile : files)
     		{
     			String modemax = maxperfile.get(infile.getName());
     			if (modemax != null)
     			{
     				File outfile = new File(outdir, infile.getName());
     				new WExtractWithMode().process(infile, outfile, modemax)
     			}
     		}
     	}
+    }

     package org.txm.importer
     import org.txm.utils.CharsetDetector;
     import org.txm.importer.ValidateXml;
     // Turns an HTML export into a file that can be parsed as XML, then validates it.
     File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.html")
     File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/odt.xml")
     String encoding = new CharsetDetector(infile).getEncoding()
     println "Encoding: $encoding"
     // quotes unquoted upper-case attributes (NAME=value) then lower-cases the whole tag (brute force)
     def quoteAndLower = { whole, tag ->
     	def quoted = tag.replaceAll("([A-Z]+=)([^\" >]+)([ >])",'$1"$2"$3' )
     	return quoted.toLowerCase()
     }
     // lower-cases one-character tags
     def lower = { whole, tag -> tag.toLowerCase() }
     String markup = infile.getText(encoding)
     // every tag except declarations starting with "<!"
     markup = markup.replaceAll(/(<[^!][^>]*>)/, quoteAndLower)
     markup = markup.replaceAll(/(<.>)/, lower)
     // resolve entities
     markup = markup.replaceAll(/&nbsp;/," ")
     // close empty tags
     markup = markup.replaceAll(/<br>/,"<br/>")
     markup = markup.replaceAll(/<meta([^>]*)>/,'<meta$1/>')
     markup = markup.replaceAll(/<img([^>]*)>/,'<img$1/>')
     // remove the doctype declaration
     markup = markup.replace('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">', '')
     // write the result with the detected encoding and report the file if it does not validate
     outfile.withWriter(encoding) { out -> out.write(markup) }
     boolean valid = ValidateXml.test(outfile)
     if (!valid) {
     	println "FILE: $outfile"
     }

     package org.txm.importer
     // Converts the split XML files of the "voeux" corpus to plain text files and
     // collects the attributes of their <text> tag in metadata.csv.
     File dir = new File("/home/mdecorde/xml/voeux/split_xml")
     File outdir = new File("/home/mdecorde/xml/voeux/split_txtcsv")
     // check the input before wiping the output directory
     def files = dir.listFiles()
     if (files == null) {
     	println "cannot list the files of $dir"
     	return
     }
     println "1) xml -> txt + write metadata.csv"
     outdir.deleteDir()
     outdir.mkdir()
     File metadatafile = new File(outdir, "metadata.csv")
     StringBuilder csvLines = new StringBuilder()
     files.sort()
     for (File f : files)
     {
     	if (!f.isFile()) continue // sub-directories cannot be read as text
     	File outfile = new File(outdir, f.getName()+".txt");
     	String text = f.getText("UTF-8");
     	String texttag = text.find("<text id.*>")
     	text = text.replaceAll("<text.*>", "").replace("</text>", "");
     	outfile.withWriter("UTF-8"){writer -> writer.write(text) }
     	if (texttag == null) {
     		// find() returns null when the file has no <text id...> tag
     		println "no <text id=...> tag in $f : no metadata line written"
     		continue
     	}
     	// <text id="a" loc="b" annee="c"> becomes "a","b","c"
     	csvLines.append(texttag.replace("<text id=","").replace(" loc=", ",").replace(" annee=", ",").replace("\">", "\"")).append("\n")
     }
     println "2) write metadata.csv"
     metadatafile.withWriter("UTF-8"){csvwriter ->
     	csvwriter.write("\"id\",\"loc\",\"annee\"\n");
     	csvwriter.write(csvLines.toString())}
     println "3) rename Voeux_*"
     // Voeux_YYYY... is renamed to YYYY.txt (characters 6 to 9 of the file name)
     outdir.eachFileMatch(~/Voeux_.*/) {file-> file.renameTo(new File(outdir, file.getName().substring(6, 10)+".txt")) }

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
     // $LastChangedRevision: 3400 $
     // $LastChangedBy: mdecorde $
     //
     package org.txm.importer.xmltxm
     import java.text.DateFormat;
     import java.util.Date;
     import java.util.ArrayList;
     import javax.xml.stream.*;
     import java.net.URL;
     import org.txm.Toolbox;
     import org.txm.importer.filters.*;
     // TODO: Auto-generated Javadoc
     /**
      * The Class BuildTTSrc.
+     *
      * @author mdecorde
      * build the TT source for tigerSearch
      */
     public class BuildTTSrc {
     	/** The url. */
     	private def url;
     	/** The input data. */
     	private def inputData;
     	/** The factory. */
     	private def factory;
     	/** The parser. */
     	private XMLStreamReader parser;
     	/** The output. */
     	private BufferedWriter output;
     	/**
     	 * Instantiates a new builds the tt src.
     	 * uses XML-TXM V2
     	 *
     	 * Opens the url and creates the StAX reader. Failures are printed on the
     	 * standard output and not rethrown: the parser is then left unset.
     	 *
     	 * @param url the url of the file to process
     	 */
     	public BuildTTSrc(URL url) {
     		this.url = url;
     		try {
     			inputData = url.openStream();
     			factory = XMLInputFactory.newInstance();
     			parser = factory.createXMLStreamReader(inputData);
     		} catch (XMLStreamException ex) {
     			System.out.println(ex);
     		} catch (IOException ex) {
     			// name the resource and the cause: the message used to stop after "parsing "
     			System.out.println("IOException while parsing " + url + " : " + ex);
     		}
     	}
     	/**
     	 * Opens the output file as a UTF-8 buffered writer stored in the output field.
     	 * Any failure is printed on the standard output.
     	 *
     	 * @param outfile the file to write
     	 * @return true if the writer was created, false otherwise
     	 */
     	private boolean createOutput(File outfile) {
     		try {
     			output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile),
     					"UTF-8"));
     			return true;
     		} catch (Exception e) {
     			System.out.println(e.getLocalizedMessage());
     			return false;
     		}
     	}
     	/**
     	 * Writes the TreeTagger source file: one line per form element of the input,
     	 * plus the s start and end tags.
     	 *
     	 * An empty form is written as __EMPTY__; line breaks are removed from the
     	 * forms and "<" is written as &lt;. When the TREETAGGER_APOSTROPHE preference
     	 * is "false", typographic apostrophes are replaced with straight ones.
     	 *
     	 * @param outfile the outfile
     	 * @param formtype not used at the moment: every form element is written
     	 *        (NOTE(review): the selection by form type is disabled in this revision)
     	 * @return true, if successful
     	 */
     	public boolean process(File outfile, String formtype) {
     		if (!createOutput(outfile))
     			return false;
     		boolean flagform = false; // true while reading the content of a form tag
     		StringBuilder form = new StringBuilder(); // the content of the current form tag
     		String word;
     		String localname = "";
     		StringBuffer buffer = new StringBuffer();
     		// closing must never hide the result of the processing
     		def closeQuietly = { resource ->
     			try {
     				resource?.close()
     			} catch (Exception ignored) {
     				// nothing more can be done with a resource that fails to close
     			}
     		}
     		try {
     			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
     				switch (event) {
     					case XMLStreamConstants.START_ELEMENT:
     						localname = parser.getLocalName();
     						switch (localname) {
     							case "form":
     								flagform = true;
     								form.setLength(0);
     								break;
     							case "s": // TreeTagger can use s tags
     								buffer.append("<s>\n");
     								break;
     						}
     						break;
     					case XMLStreamConstants.END_ELEMENT:
     						localname = parser.getLocalName();
     						switch (localname) {
     							case "form":
     								flagform = false;
     								word = form.toString().trim()
     								if (word.length() == 0) buffer.append("__EMPTY__\n");
     								else buffer.append(word.replace("\n", "").replace("<", "&lt;")+ "\n");
     								break;
     							case "s":
     								buffer.append("</s>\n");
     								break;
     						}
     						break;
     					case XMLStreamConstants.CHARACTERS:
     						if (flagform) {
     							form.append(parser.getText());
     						}
     						break;
     				}
     			}
     			String str = buffer.toString()
     			if ("false".equals(Toolbox.getPreference(Toolbox.TREETAGGER_APOSTROPHE))) {
     				str = str.replace("’", "'").replace("‘", "'");
     			}
     			output.write(str)
     			// flush here so that a write failure is reported as an unsuccessful run
     			output.flush();
     		} catch (Exception ex) {
     			System.out.println(ex);
     			return false;
     		} finally {
     			// release the writer, the reader and the input stream on every path
     			closeQuietly(output)
     			closeQuietly(parser)
     			closeQuietly(inputData)
     		}
     		return true;
     	}
     	/**
     	 * Builds the TreeTagger source of xml/rgaqcj/anainline/roland-p5.xml (under
     	 * the user home directory) into xml/rgaqcj/ttsrc/roland-p5.tt.
     	 *
     	 * @param args the arguments (not used)
     	 */
     	public static void main(String[] args) {
     		// "~" is not expanded by java.io.File: resolve the home directory explicitly
     		String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";
     		File srcfile = new File(rootDir + "anainline/", "roland-p5.xml");
     		File resultfile = new File(rootDir + "ttsrc/", "roland-p5.tt");
     		println("build ttsrc file : " + srcfile + " to : " + resultfile);
     		// the class only declares BuildTTSrc(URL) and process(File, String)
     		def builder = new BuildTTSrc(srcfile.toURI().toURL());
     		builder.process(resultfile, null);
     		return;
     	}
+    }

     // Copyright © 2010-2013 ENS de Lyon.
     // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
     // Lyon 2, University of Franche-Comté, University of Nice
     // Sophia Antipolis, University of Paris 3.
     //
     // The TXM platform is free software: you can redistribute it
     // and/or modify it under the terms of the GNU General Public
     // License as published by the Free Software Foundation,
     // either version 2 of the License, or (at your option) any
     // later version.
     //
     // The TXM platform is distributed in the hope that it will be
     // useful, but WITHOUT ANY WARRANTY; without even the implied
     // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     // PURPOSE. See the GNU General Public License for more
     // details.
     //
     // You should have received a copy of the GNU General
     // Public License along with the TXM platform. If not, see
     // http://www.gnu.org/licenses.
     //
     //
     //
     // $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
     // $LastChangedRevision: 3430 $
     // $LastChangedBy: mdecorde $
     //
     package org.txm.importer.xmltxm
     import org.txm.importer.HasElement;
     import org.txm.importer.StaxIdentityParser;
     import java.text.DateFormat;
     import java.util.ArrayList;
     import java.util.Date;
     import java.util.HashMap;
     import java.util.Locale;
     import javax.xml.stream.*;
     import java.net.URL;
     import org.txm.importer.filters.*;
     import org.txm.utils.AsciiUtils;
     /**
      * The Class Xml2Ana.
+     *
      * @author mdecorde
      * Transforms a pre XML-TEI file into an XML-TEI-TXM file.
      * The pre XML-TEI file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt.
+     *
      * You must specify the correspondence between word attributes and ana types and respStmt IDs;
      * the attributes of the word tags will then be transformed into txm:ana elements.
      */
     public class Xml2Ana extends StaxIdentityParser
+    {
     	/** The dir. Not referenced in the visible part of this class. */
     	private def dir;
     	/**
     	 * The convert all attributes flag: when true, word attributes that have no
     	 * correspondence are collected as ana entries (resp "none") instead of being
     	 * copied onto the word element.
     	 */
     	private boolean convertAllAttributes = false;
     	/** The corresp type: word attribute name -> ana type (written prefixed with "#"). */
     	HashMap<String,String> correspType;
     	/** The corresp ref: word attribute name -> resp reference (written prefixed with "#"). */
     	HashMap<String,String> correspRef;
     	/** The check tags: watched element local name -> whether a start element with that name was seen. */
     	HashMap<String,Boolean> checkTags = new HashMap<String,Boolean>();
     	/** The resp id: the identifiers iterated by writeTXMResps() and writeTXMApps(). */
     	def respId = [];
     	/**
     	 * The applications, keyed by resp id.
     	 * NOTE(review): declared with File values, but writeTXMApps() reads each value
     	 * as a list (ident, version, report file) - confirm the intended value type.
     	 */
     	HashMap<String,File> applications;
     	/** The taxonomies, keyed by resp id; writeTXMApps() writes each array item as a ref target. */
     	HashMap<String,String[]> taxonomies;
     	/**
     	 * The resps, keyed by resp id. writeTXMResps() reads index 0 as the resp text,
     	 * 1 as the name text, 2 as the date "when" attribute and 3 as the date text.
     	 */
     	HashMap<String,String[]> resps;
     	/** The items: taxonomy id -> (ref type -> ref target), written by writeTXMTaxonomies(). */
     	HashMap<String,HashMap<String,String>> items;
     	/** The XML headeradded. Not referenced in the visible part of this class. */
     	boolean XMLHeaderadded = false;
     	/** Name of the source file without its last extension; used as id of the text element. */
     	String textname;
     	/** Local name of the element handled as a word. */
     	String wtag = "w";
     	/** Local name of the text element. */
     	public static final String TEXT = "text"
     	/** Name of the id attribute. */
     	public static final String ID = "id"
     	/**
     	 * Instantiates a new xml2 ana.
+    	 *
     	 * Computes the text name from the file name (its last extension is removed)
     	 * and registers the header tags to watch for.
     	 *
     	 * @param file the source XML file; its URL is handed to the parent parser
     	 */
     	public Xml2Ana(File file) {
     		super(file.toURI().toURL());
     		//File file = new File(url.getFile()).getAbsoluteFile()
     		textname = file.getName();
     		// strip the last extension; a name whose only dot is the first character is kept as is
     		int idx = textname.lastIndexOf(".");
     		if (idx > 0)
     			textname = textname.substring(0, idx)
     		// watched header tags, flagged in processStartElement() when encountered
     		checkTags.put("respStmt",false);
     		checkTags.put("titleStmt",false);
     		checkTags.put("appInfo",false);
     		// presumably reports whether the file already contains a "text" element - confirm against HasElement
     		hasText = new HasElement(file, TEXT).process();
+    	}
     	/**
     	 * Sets the convert all attributes flag.
+    	 *
     	 * @param value true to collect word attributes without a correspondence as ana entries,
     	 *        false to copy them unchanged onto the word element
     	 * @return no return type is declared; the method body is a single assignment
     	 */
     	public setConvertAllAtrtibutes(boolean value) {
     		convertAllAttributes = value;
+    	}
     	/**
     	 * Sets the local name of the element handled as a word (the default is "w").
+    	 *
     	 * @param wtag the local name of the word element
     	 * @return no return type is declared; the method body is a single assignment
     	 */
     	public setWordTag(String wtag) {
     		this.wtag = wtag
+    	}
     	/** Incremented for each word start element. */
     	int idcount = 0;
     	/** True between the start and the end of a word element; processCharacters() then writes trimmed text. */
     	boolean flagWord = false;
     	/** Number of start elements seen so far (a counter, despite the name). */
     	int firstElement = 0;
     	/** True when a TEI element was generated by addTEIElement(); after() closes it. */
     	boolean teiElementAdded = false;
     	/** True when a teiHeader was generated because the second element was not a teiHeader. */
     	boolean teiHeaderElementAdded = false;
     	/** True once a text element exists, either reported for the source or generated. */
     	boolean hasText = false;
     	/** True when a text element was generated; after() closes it. */
     	boolean textElementAdded = false;
     	/** [resp, type, value] triples collected from the attributes of the current word. */
     	def anabalises = [];
     	/**
     	 * Handles a start element.
     	 * <ul>
     	 * <li>word element (local name equal to wtag): the attributes that have a correspondence are
     	 * collected as ana entries, the id attribute is normalised and a "form" element is opened;</li>
     	 * <li>"TEI": delegated to the parent class, then the TEINS default namespace and the TXMNS
     	 * namespace are declared when the element does not declare them;</li>
     	 * <li>any other element: a TEI element, a teiHeader and/or a text element are generated when
     	 * they are missing, then the element is delegated to the parent class.</li>
     	 * </ul>
     	 */
     	protected void processStartElement()
+    	{
     //		println "checkTags=$checkTags";
     //		println "parser=$parser";
     		// count the start elements: 1 is the root, 2 is its first descendant
     		firstElement++;
     		// flag the watched header tags that are encountered
     		if (this.checkTags.containsKey(parser.getLocalName())) {
     			this.checkTags.put(parser.getLocalName(), true);
+    		}
     		switch (parser.getLocalName()) {
     			case wtag:
     				// a word shows up while no text element exists: open a generated one
     				if (!hasText) {
     					writer.writeStartElement(TEXT);
     					writer.writeAttribute(ID, textname);
     					textElementAdded = true;
     					hasText = true;
+    				}
     				idcount++; // increment word counter
     				anabalises.clear();
     				writer.writeStartElement(parser.getLocalName()); // write w
     				for (int i = 0 ; i < parser.getNamespaceCount() ; i++) // write namespaces
     					writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
     				for (int i= 0 ; i < parser.getAttributeCount() ;i++ ) { // transform attributes
     					String type = parser.getAttributeLocalName(i);
     					String value = parser.getAttributeValue(i);
     					if (correspType.containsKey(type)) { // check if txm:ana
     						String corresptype = correspType.get(type);
     						String ref = correspRef.get(type);
     						// stored as [resp, type, value]; written by processEndElement()
     						anabalises.add(["#"+ref, "#"+corresptype, value]);
     					} else if (type == ID) { // keep id attribute
     						String wordid = value
     						// an id starting with "w" but not with "w_" gets the "w_" prefix: "w12" becomes "w_12"
     						if (wordid.startsWith("w")) {
     							if (!wordid.startsWith("w_"))
     								wordid = "w_"+wordid.substring(1)
+    						}
     //						else {
     //							wordid = "w_"+textname+"_"+wordid;
     //						}
     						wordid = AsciiUtils.buildAttributeId(wordid); // remove characters not compatible with the id attribute value
     						writer.writeAttribute(type, wordid);
     					} else { // add attributes that was in the original <w>
     						// without a correspondence: either an ana entry with resp "none" or a plain copy
     						if (convertAllAttributes)
     							anabalises.add(["none","#"+type, value])
     						else
     							writer.writeAttribute(type, value);
+    					}
+    				}
     				flagWord = true; // start to capture the form
     				writer.writeStartElement(TXMNS, "form");
     				break;
     			case "TEI":
     				super.processStartElement();
     				// look for the TEINS and TXMNS declarations on the source element
     				boolean hasTeiNS = false;
     				boolean hasTXMNs = false;
     				for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
     					if (parser.getNamespaceURI(i) == TXMNS)
     						hasTXMNs = true;
     					else if (parser.getNamespaceURI(i) == TEINS)
     						hasTeiNS = true;
+    				}
     				// declare the missing namespaces on the element just written
     				if (!hasTeiNS) {
     					writer.writeDefaultNamespace(TEINS);
+    				}
     				if (!hasTXMNs)
     					writer.writeNamespace(TXM, TXMNS);
     				break;
     			default:
     				// NOTE(review): "localname" is not declared in the visible part of this class;
     				// presumably an inherited field holding the current local name - confirm
     				if (TEXT.equals(localname)) {
     					hasText = true;
+    				}
     				if (firstElement == 1) { // test if first element is TEI
     					//println "first tag: "+parser.getLocalName()
     					// NOTE(review): if localname equals parser.getLocalName(), "TEI" is already handled
     					// by the case above and the else branch below looks unreachable - confirm
     					if (localname != "TEI") { // "TEI" is missing
     						teiElementAdded = true;
     						addTEIElement();
     					} else if (!hasText) {
     						writer.writeStartElement(TEXT);
     						writer.writeAttribute(ID, textname);
     						textElementAdded = true;
     						hasText = true;
+    					}
+    				}
     				// second element while the root came from the source: a teiHeader is expected here
     				if (firstElement == 2 && teiElementAdded != true) {
     					//println "second tag: "+parser.getLocalName()
     					if (localname != "teiHeader") { // teiHeader is missing
     						writeTeiHeader();
     						hasTeiHeader = true
     						teiHeaderElementAdded = true
+    					}
     				} else if (!hasText & (teiElementAdded | teiHeaderElementAdded)) {
     					// a TEI element or a teiHeader was generated and no text element exists: open one
     					writer.writeStartElement(TEXT);
     					writer.writeAttribute(ID, textname);
     					textElementAdded = true;
     					hasText = true;
+    				}
     				super.processStartElement();
     				// a source text element with a missing or empty id attribute gets the text name as id
     				if (TEXT.equals(localname)) {
     					if (!parser.getAttributeValue(null, ID)) {
     						writer.writeAttribute(ID, textname);
+    					}
+    				}
+    		}
+    	}
     	/**
     	 * Closes the elements generated by this class, the text element first and the
     	 * TEI element second, then delegates to the parent class.
     	 */
     	protected void after()
+    	{
     		if (textElementAdded) {
     			writer.writeEndElement(); // text
+    		}
     		if (teiElementAdded) {
     			writer.writeEndElement(); // TEI
+    		}
     		super.after(); // close writer, parser, etc
+    	}
     	/**
     	 * Opens a generated "TEI" element, declares its namespaces and writes a
     	 * generated teiHeader inside it. The element is left open; after() closes it
     	 * when teiElementAdded is set.
     	 */
     	protected void addTEIElement()
+    	{
     		writer.writeStartElement("TEI");
     		// TEINS is declared both as the default namespace and under the TEI prefix
     		writer.writeDefaultNamespace(TEINS);
     		writer.writeNamespace(TXM, TXMNS);
     		writer.writeNamespace(TEI, TEINS);
     		writeTeiHeader();
+    	}
     	/**
     	 * Handles a text event: outside a word element the event is delegated to the
     	 * parent class, inside a word element the trimmed text is written.
     	 */
     	protected void processCharacters()
+    	{
     		if (!flagWord) {
     			// not inside a word element: default processing
     			super.processCharacters();
     			return;
     		}
     		// inside a word element: keep the form on a single line
     		String form = parser.getText().trim();
     		writer.writeCharacters(form);
+    	}
     	/** Set when the end of a classDecl element is processed. */
     	boolean hasClassDecl = false;
     	/** Set when the end of a fileDesc element is processed. */
     	boolean hasFileDesc = false;
     	/** Set when the end of an encodingDesc element is processed. */
     	boolean hasEncodingDesc = false;
     	/** Set when the end of a teiHeader element is processed, or when a teiHeader is generated as second element. */
     	boolean hasTeiHeader = false;
     	/** Set when the end of a TEI element is processed. */
     	boolean hasTEI = false;
     	/** Local name of the element written for each collected word attribute. */
     	public static String ANA = "ana"
     	/** Name of the resp attribute; also used as element name in writeTXMResps(). */
     	public static String RESP = "resp"
     	/** Name of the type attribute. */
     	public static String TYPE = "type"
     	/**
     	 * Handles an end element. Before delegating to the parent class:
     	 * the end of a word closes the form element and writes one ana element per
     	 * collected attribute; the ends of fileDesc, classDecl, encodingDesc,
     	 * teiHeader and TEI inject the TXM header content that belongs there.
     	 */
     	protected void processEndElement()
+    	{
     		String endedName = parser.getLocalName();
     		switch (endedName) {
     			case wtag:
     				// close txm:form, then emit <txm:ana resp=ref type=corresptype>value</txm:ana> for each entry
     				writer.writeEndElement();
     				for (def entry : anabalises) {
     					def (respValue, typeValue, content) = entry
     					writer.writeStartElement(TXMNS, ANA);
     					writer.writeAttribute(RESP, respValue);
     					writer.writeAttribute(TYPE, typeValue);
     					writer.writeCharacters(content);
     					writer.writeEndElement(); // txm:ana
     				}
     				flagWord = false; // the form is complete
     				break;
     			case "fileDesc":
     				hasFileDesc = true;
     				writeTXMResps();
     				break;
     			case "classDecl":
     				hasClassDecl = true;
     				writeTXMTaxonomies();
     				break;
     			case "encodingDesc":
     				hasEncodingDesc = true;
     				writeContentOfEncodingDesc();
     				break;
     			case "teiHeader":
     				hasTeiHeader = true
     				// the header had no encodingDesc: generate one
     				if (!hasEncodingDesc) {
     					writer.writeStartElement("encodingDesc");
     					writeContentOfEncodingDesc();
     					writer.writeEndElement();
     				}
     				break;
     			case "TEI":
     				hasTEI = true;
     				// no teiHeader was seen or generated: write one before the TEI end tag
     				if (!hasTeiHeader) {
     					writeTeiHeader();
     				}
     				break;
     		}
     		super.processEndElement();
+    	}
     	/**
     	 * Writes a complete generated teiHeader element: a fileDesc holding the resp
     	 * statements, a titleStmt with an empty title, an empty publicationStmt and an
     	 * empty sourceDesc, followed by an encodingDesc.
     	 */
     	protected void writeTeiHeader()
+    	{
     		writer.writeStartElement("teiHeader");
     		writer.writeStartElement("fileDesc")
     		// the respStmt elements are written first, directly under fileDesc
     		this.writeTXMResps();
     		writer.writeStartElement("titleStmt")
     		writer.writeStartElement("title")
     		writer.writeEndElement(); // title
     		writer.writeEndElement(); // titleStmt
     		writer.writeStartElement("publicationStmt")
     		writer.writeEndElement(); // publicationStmt
     		writer.writeStartElement("sourceDesc")
     		writer.writeEndElement(); // sourceDesc
     		writer.writeEndElement(); // fileDesc
     		writer.writeStartElement("encodingDesc");
     		writeContentOfEncodingDesc();
     		writer.writeEndElement(); // encodingDesc
     		writer.writeEndElement(); // teiHeader
+    	}
     	/**
     	 * Writes the content added to an encodingDesc: an appInfo element holding the
     	 * application descriptions, then a classDecl holding the taxonomies unless the
     	 * end of a classDecl element was already processed.
     	 */
     	protected void writeContentOfEncodingDesc()
+    	{
     		writer.writeStartElement("appInfo")
     		this.writeTXMApps();
     		writer.writeEndElement(); // appInfo
     		// no classDecl was closed so far: generate one for the taxonomies
     		if (!hasClassDecl) {
     			writer.writeStartElement("classDecl");
     			this.writeTXMTaxonomies();
     			writer.writeEndElement(); // classDecl
+    		}
+    	}
     	/**
     	 * Lists the watched header tags that were encountered while parsing.
+    	 *
     	 * Only the tags whose flag in checkTags is true are reported, so that the
     	 * "found tags" label matches the content of the list.
     	 *
     	 * @return the string "found tags : \n" followed by one line per found tag,
     	 *         each made of a tab, the tag name and a line feed
     	 */
     	public String checkResp()
+    	{
     		StringBuilder rez = new StringBuilder("found tags : \n");
     		for (String key : checkTags.keySet()) {
     			// the flag is set by processStartElement() when the tag is met
     			if (checkTags.get(key))
     				rez.append("\t").append(key).append("\n");
     		}
     		return rez.toString();
+    	}
     	/**
     	 * Sets the correspondances used to convert the attributes of the word elements.
+    	 *
     	 * @param correspRef the corresp ref: word attribute name -> resp reference
     	 * @param correspType the corresp type: word attribute name -> ana type
     	 */
     	public void setCorrespondances(correspRef, correspType)
+    	{
     		this.correspRef = correspRef;
     		this.correspType = correspType;
+    	}
     	/**
     	 * Sets the header infos written by writeTXMResps(), writeTXMApps() and
     	 * writeTXMTaxonomies(). The values are stored as given, without any copy.
+    	 *
     	 * @param respId the resp ids to iterate over
     	 * @param resps the resps: resp id -> array read at indexes 0 to 3 by writeTXMResps()
     	 * @param applications the applications: resp id -> value read as a list (ident, version, report file) by writeTXMApps()
     	 * @param taxonomies the taxonomies: resp id -> targets written by writeTXMApps()
     	 * @param items the items: taxonomy id -> (ref type -> ref target)
     	 */
     	public void setHeaderInfos(respId,resps, applications, taxonomies, items)
+    	{
     		this.respId = respId
     		this.resps = resps
     		this.applications = applications
     		this.taxonomies = taxonomies;
     		this.items = items;
+    	}
     	/**
     	 * Writes one respStmt element per resp id. Each respStmt holds a resp
     	 * element (id attribute, text, nested date) followed by a name element of
     	 * type "person".
     	 */
     	public void writeTXMResps()
+    	{
     		for (String respRef : respId) {
     			String[] respInfos = resps.get(respRef);

     			writer.writeStartElement("respStmt");

     			// <resp id=...>text<date when=...>text</date></resp>
     			writer.writeStartElement(RESP);
     			writer.writeAttribute(ID, respRef);
     			writer.writeCharacters(respInfos[0]);
     			writer.writeStartElement("date");
     			writer.writeAttribute("when", respInfos[2]);
     			writer.writeCharacters(respInfos[3]);
     			writer.writeEndElement();
     			writer.writeEndElement();

     			// <name type="person">text</name>
     			writer.writeStartElement("name");
     			writer.writeAttribute(TYPE, "person");
     			writer.writeCharacters(respInfos[1]);
     			writer.writeEndElement();

     			writer.writeEndElement(); // respStmt
     		}
+    	}
     	/**
     	 * Writes one application element per resp id, with its ident, version and
     	 * resp attributes, the non-empty lines of the optional report file, and an
     	 * "ab" element listing the tagset targets registered for the resp id.
     	 */
     	public void writeTXMApps()
+    	{
     		for (String ref : respId) {
     			// each value is read as a list: ident, version, report file
     			List<String> list= applications.get(ref);
     			String ident = list.get(0);
     			String version = list.get(1);
     			File report = list.get(2);
     			writer.writeStartElement(TXMNS, "application");
     			writer.writeAttribute("ident", ident);
     			writer.writeAttribute("version", version);
     			writer.writeAttribute(RESP, ref);
     			//get txm:commandLine from GeneratedReport
     			if (report != null) {
     				// presumably forces the start tag to be completed and flushed before the
     				// report is copied through "output" rather than through the writer - confirm
     				writer.writeCharacters("");writer.flush();
     				Reader reader = new FileReader(report);
     				try {
     					String line = reader.readLine();
     					while (line != null) {
     						if (line.length() != 0) // empty lines are dropped
     							output.write(line+"\n");
     						line = reader.readLine();
     					}
     				} finally {
     					// release the file handle even when the copy fails
     					reader.close();
     				}
     			}
     			writer.writeStartElement("ab");
     			writer.writeAttribute(TYPE, "annotation");
     			for (String item : taxonomies.get(ref)) {
     				writer.writeStartElement("list");
     				// the two attributes below belong to the empty ref element
     				writer.writeEmptyElement("ref");
     				writer.writeAttribute(TYPE, "tagset");
     				writer.writeAttribute("target", item);
     				writer.writeEndElement(); // list
     			}
     			writer.writeEndElement(); // ab
     			writer.writeEndElement(); // txm:application
     		}
+    	}
     	/**
     	 * Writes one taxonomy element per entry of items. Each taxonomy holds a bibl
     	 * element of type "tagset" with the taxonomy id as title and one empty ref
     	 * element (type and target attributes) per registered reference.
     	 */
     	public void writeTXMTaxonomies()
+    	{
     		for (Map.Entry<String, HashMap<String,String>> taxonomy : items.entrySet()) {
     			String taxId = taxonomy.getKey();

     			writer.writeStartElement("taxonomy");
     			writer.writeAttribute(ID, taxId);

     			writer.writeStartElement("bibl");
     			writer.writeAttribute(TYPE, "tagset");

     			writer.writeStartElement("title");
     			writer.writeCharacters(taxId);
     			writer.writeEndElement();

     			// one <ref type=... target=.../> per reference of the taxonomy
     			for (Map.Entry<String,String> reference : taxonomy.getValue().entrySet()) {
     				writer.writeEmptyElement("ref");
     				writer.writeAttribute(TYPE, reference.getKey());
     				writer.writeAttribute("target", reference.getValue());
     			}

     			writer.writeEndElement(); // bibl
     			writer.writeEndElement(); // taxonomy
     		}
+    	}
     	/**
     	 * The main method.
+    	 *
     	 * @param args the arguments

... Ce différentiel a été tronqué car il excède la taille maximale pouvant être affichée.

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 881