root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTMLIndexer.groovy @ 2473
History | View | Annotate | Download (4.6 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate:$
|
25 |
// $LastChangedRevision:$
|
26 |
// $LastChangedBy:$
|
27 |
//
|
28 |
package org.txm.scripts.importer
|
29 |
|
30 |
import java.util.Collections; |
31 |
import java.io.File; |
32 |
import org.txm.utils.io.FileCopy; |
33 |
import org.txm.utils.*; |
34 |
import java.io.File; |
35 |
import java.util.HashMap; |
36 |
import java.util.ArrayList; |
37 |
import java.io.File; |
38 |
import javax.xml.stream.*; |
39 |
import java.net.URL; |
40 |
|
41 |
// TODO: Auto-generated Javadoc
|
42 |
/**
 * Builds an in-memory index of the anchors found in a directory of HTML pages.
 *
 * While a page is parsed, every element whose {@code id} attribute starts with
 * {@link #idxprefix} is recorded under the last space-separated word of the
 * most recent non-blank text, as a {@code "<file name>#<id>"} reference.
 */
class HTMLIndexer {

	/** Prefix of the {@code id} attribute values that mark an indexed position. */
	static String idxprefix = "IDX-";

	/** Maps each indexed token to the {@code "<file name>#<id>"} references collected for it. */
	HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

	/**
	 * Indexes every {@code *.html} file returned by
	 * {@code DeleteDir.scanDirectory(htmlDirectory, true)}, merges near-duplicate
	 * tokens and prints the resulting index on standard output.
	 *
	 * @param htmlDirectory the directory containing the HTML pages
	 * @return true if every HTML file was parsed without error, false if at
	 *         least one of them failed (the remaining files are still indexed)
	 */
	private boolean processHTMLDir(File htmlDirectory)
	{
		ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
		Collections.sort(htmlfiles);

		boolean allParsed = true;
		for (File htmlFile : htmlfiles) // collect the index entries of every page
		{
			if (htmlFile.getName().endsWith(".html"))
				allParsed = processHTMLFile(htmlFile) && allParsed;
		}

		mergeNearDuplicates();

		// print in sorted order so that the output is deterministic
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
		Collections.sort(tokens);
		for (String token : tokens)
		{
			println("Token: "+token);
			println(index.get(token));
		}
		return allParsed;
	}

	/**
	 * Folds doubles such as "étiquette" and "étiquettes" into a single entry:
	 * when a token is immediately followed, in sorted order, by the same token
	 * plus exactly one trailing character, the references of the longer token
	 * are appended to the shorter one and the longer token is dropped from the
	 * index.
	 */
	private void mergeNearDuplicates()
	{
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
		Collections.sort(tokens);

		// stop one before the end: each step compares a token with its successor
		for (int i = 0 ; i + 1 < tokens.size() ; i++)
		{
			String t1 = tokens.get(i);
			String t2 = tokens.get(i + 1);
			if (t2.length() == t1.length() + 1 && t2.startsWith(t1))
			{
				// remove the key from the index itself, not only from the local list
				index.get(t1).addAll(index.remove(t2));
				tokens.remove(i + 1);
			}
		}
	}

	/**
	 * Parses one HTML file with a StAX reader and adds its anchors to the index.
	 *
	 * The file must be well-formed XML. DTD support and entity reference
	 * replacement are switched off on the parser factory.
	 *
	 * @param htmlFile the html file
	 * @return true if the file was parsed up to the end of the document, false
	 *         if an exception occurred (it is printed on standard output)
	 */
	private boolean processHTMLFile(File htmlFile)
	{
		String lasttoken = null;
		InputStream inputData = null;
		// declared outside the try block so that the finally block can close it
		XMLStreamReader parser = null;

		try
		{
			URL url = htmlFile.toURI().toURL();
			println "process html file "+url;
			inputData = url.openStream();

			XMLInputFactory factory = XMLInputFactory.newInstance();
			factory.setProperty("javax.xml.stream.supportDTD", false);
			factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

			parser = factory.createXMLStreamReader(inputData);
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event)
				{
					case XMLStreamConstants.START_ELEMENT:
						String id = parser.getAttributeValue(null, "id");
						// an anchor met before any text has no token to be filed under: skip it
						// rather than storing a null key, which would break the sort later on
						if (id != null && id.startsWith(idxprefix) && lasttoken != null)
						{
							if (!index.containsKey(lasttoken))
								index.put(lasttoken, new ArrayList<String>());
							index.get(lasttoken).add(htmlFile.getName()+"#"+id)
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						String text = parser.getText().trim();
						if (text.length() > 0)
						{
							// remember the last word of the text, without its final dot
							def texts = text.split(" ");
							lasttoken = texts[texts.size()-1];
							if (lasttoken.endsWith("."))
								lasttoken = lasttoken.substring(0, lasttoken.length() -1)
						}
						break;
				}
			}
			return true;
		}
		catch (Exception e)
		{
			println("File "+htmlFile+"\n"+e);
			return false;
		}
		finally
		{
			// best-effort cleanup: a failure to close must not hide the parse result
			if (parser != null) {
				try { parser.close(); } catch (XMLStreamException ignored) { }
			}
			if (inputData != null) {
				try { inputData.close(); } catch (IOException ignored) { }
			}
		}
	}

	/**
	 * The main method.
	 *
	 * @param args optional; args[0] is the directory of HTML pages to index,
	 *        "/home/mdecorde/xml/html" is used when no argument is given
	 */
	public static void main(String[] args)
	{
		String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/html";
		File htmlDirectory = new File(path)
		new HTMLIndexer().processHTMLDir(htmlDirectory);
	}
}