Révision 2905
| tmp/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ImportGlozzCorpus.java (revision 2905) | ||
|---|---|---|
| 11 | 11 |
import org.eclipse.core.runtime.IProgressMonitor; |
| 12 | 12 |
import org.eclipse.core.runtime.IStatus; |
| 13 | 13 |
import org.eclipse.core.runtime.Status; |
| 14 |
import org.eclipse.jface.dialogs.MessageDialog; |
|
| 14 | 15 |
import org.eclipse.osgi.util.NLS; |
| 16 |
import org.eclipse.swt.widgets.Display; |
|
| 17 |
import org.eclipse.swt.widgets.MessageBox; |
|
| 15 | 18 |
import org.kohsuke.args4j.Option; |
| 16 | 19 |
import org.txm.Toolbox; |
| 17 | 20 |
import org.txm.annotation.urs.URSAnnotationReIndexer; |
| ... | ... | |
| 19 | 22 |
import org.txm.annotation.urs.Messages; |
| 20 | 23 |
import org.txm.objects.Project; |
| 21 | 24 |
import org.txm.rcp.handlers.scripts.ExecuteImportScript; |
| 25 |
import org.txm.rcp.messages.TXMUIMessages; |
|
| 22 | 26 |
import org.txm.rcp.swt.widget.parameters.ParametersDialog; |
| 23 | 27 |
import org.txm.rcp.utils.JobHandler; |
| 24 | 28 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| ... | ... | |
| 33 | 37 |
import visuAnalec.fichiers.FichiersGlozz; |
| 34 | 38 |
import visuAnalec.fichiers.FichiersJava; |
| 35 | 39 |
|
| 36 |
public class ImportGlozzCorpus extends AbstractHandler {
|
|
| 37 |
|
|
| 38 |
|
|
| 39 |
|
|
| 40 |
// @Option(name="aafile",usage="an example file", widget="File", required=true, def="text.aa")
|
|
| 41 |
// File aafile;
|
|
| 42 |
|
|
| 43 |
// @Option(name="acfile",usage="an example file", widget="File", required=true, def="annotations.ac")
|
|
| 44 |
// File acfile
|
|
| 45 |
|
|
| 46 |
@Option(name="glozzDirectory",usage="A folder containing the Glozz files: aa ac and aam", widget="Folder", required=true, def="glozz")
|
|
| 40 |
public class ImportGlozzCorpus extends AbstractHandler {
|
|
| 41 |
|
|
| 42 |
|
|
| 43 |
|
|
| 44 |
// @Option(name="aafile",usage="an example file", widget="File", required=true, def="text.aa")
|
|
| 45 |
// File aafile;
|
|
| 46 |
|
|
| 47 |
// @Option(name="acfile",usage="an example file", widget="File", required=true, def="annotations.ac")
|
|
| 48 |
// File acfile
|
|
| 49 |
|
|
| 50 |
@Option(name = "glozzDirectory", usage = "A folder containing the Glozz files: aa ac and aam", widget = "Folder", required = true, def = "glozz")
|
|
| 47 | 51 |
File glozzDirectory; |
| 48 |
|
|
| 49 |
@Option(name="aamfile",usage="The aam file to use", widget="File", required=true, def="model.aam")
|
|
| 52 |
|
|
| 53 |
@Option(name = "aamfile", usage = "The aam file to use", widget = "File", required = true, def = "model.aam")
|
|
| 50 | 54 |
File aamfile; |
| 55 |
|
|
| 51 | 56 |
private String name; |
| 52 |
|
|
| 53 |
/* (non-Javadoc) |
|
| 57 |
|
|
| 58 |
/* |
|
| 59 |
* (non-Javadoc) |
|
| 54 | 60 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
| 55 | 61 |
*/ |
| 56 | 62 |
@Override |
| ... | ... | |
| 58 | 64 |
try {
|
| 59 | 65 |
// Open the parameters input dialog box |
| 60 | 66 |
if (!ParametersDialog.open(this)) return null; |
| 61 |
|
|
| 67 |
|
|
| 62 | 68 |
// END OF PARAMETERS |
| 63 | 69 |
if (!glozzDirectory.exists()) {
|
| 64 | 70 |
System.out.println(Messages.ImportGlozzCorpus_0); |
| 65 | 71 |
return null; |
| 66 | 72 |
} |
| 67 |
|
|
| 73 |
|
|
| 68 | 74 |
name = glozzDirectory.getName(); |
| 69 | 75 |
name = AsciiUtils.buildId(name); |
| 70 |
|
|
| 76 |
String newCorpusName = name.toUpperCase(); |
|
| 77 |
if (Toolbox.workspace.getProject(newCorpusName) != null) {
|
|
| 78 |
boolean b = MessageDialog.openConfirm(Display.getCurrent().getActiveShell(), TXMUIMessages.warning, NLS.bind(TXMUIMessages.theP0CorpusDirectoryAlreadyExistsDoYouWantToReplaceIt, |
|
| 79 |
newCorpusName)); |
|
| 80 |
if (!b) {
|
|
| 81 |
Log.info("Import aborted.");
|
|
| 82 |
return null; |
|
| 83 |
} |
|
| 84 |
} |
|
| 85 |
|
|
| 71 | 86 |
final File srcDir = new File(glozzDirectory, name); |
| 72 | 87 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_1, srcDir)); |
| 73 | 88 |
DeleteDir.deleteDirectory(srcDir); |
| 74 | 89 |
srcDir.mkdir(); |
| 75 |
|
|
| 90 |
|
|
| 76 | 91 |
final File ecDir = new File(glozzDirectory, "ec"); //$NON-NLS-1$ |
| 77 | 92 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_3, ecDir)); |
| 78 | 93 |
DeleteDir.deleteDirectory(ecDir); |
| 79 | 94 |
ecDir.mkdir(); |
| 80 |
|
|
| 81 |
|
|
| 95 |
|
|
| 96 |
|
|
| 82 | 97 |
if (!srcDir.exists()) {
|
| 83 | 98 |
System.out.println(Messages.ImportGlozzCorpus_4); |
| 84 | 99 |
return null; |
| 85 | 100 |
} |
| 86 |
|
|
| 101 |
|
|
| 87 | 102 |
if (!ecDir.exists()) {
|
| 88 | 103 |
System.out.println(Messages.ImportGlozzCorpus_5); |
| 89 | 104 |
return null; |
| 90 | 105 |
} |
| 91 |
|
|
| 106 |
|
|
| 92 | 107 |
// write the TXT file WITH paragraphs |
| 93 | 108 |
File[] aaFiles = glozzDirectory.listFiles(new FilenameFilter() {
|
| 109 |
|
|
| 94 | 110 |
@Override |
| 95 | 111 |
public boolean accept(File dir, String name) {
|
| 96 | 112 |
return name.endsWith(".aa"); //$NON-NLS-1$
|
| 97 | 113 |
} |
| 98 | 114 |
}); |
| 99 | 115 |
File[] acFiles = glozzDirectory.listFiles(new FilenameFilter() {
|
| 116 |
|
|
| 100 | 117 |
@Override |
| 101 | 118 |
public boolean accept(File dir, String name) {
|
| 102 | 119 |
return name.endsWith(".ac"); //$NON-NLS-1$
|
| 103 | 120 |
} |
| 104 | 121 |
}); |
| 105 |
|
|
| 122 |
|
|
| 106 | 123 |
if (aaFiles == null || aaFiles.length == 0) {
|
| 107 | 124 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_8, glozzDirectory)); |
| 108 | 125 |
return false; |
| ... | ... | |
| 115 | 132 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_10, glozzDirectory)); |
| 116 | 133 |
return false; |
| 117 | 134 |
} |
| 118 |
|
|
| 135 |
|
|
| 119 | 136 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_11, glozzDirectory)); |
| 120 | 137 |
Arrays.sort(acFiles); |
| 121 | 138 |
Arrays.sort(aaFiles); |
| 122 |
for (int i = 0 ; i < aaFiles.length ; i++) {
|
|
| 139 |
for (int i = 0; i < aaFiles.length; i++) {
|
|
| 123 | 140 |
File acFile = acFiles[i]; |
| 124 | 141 |
File aaFile = aaFiles[i]; |
| 125 | 142 |
String name = acFile.getName(); |
| 126 | 143 |
int idx = name.indexOf("."); //$NON-NLS-1$
|
| 127 | 144 |
if (idx > 0) name = name.substring(0, idx); |
| 128 |
|
|
| 129 |
File txtFile = new File(srcDir, name+".txt"); //$NON-NLS-1$
|
|
| 130 |
File ecFile = new File(ecDir, name+".ec"); //$NON-NLS-1$
|
|
| 145 |
|
|
| 146 |
File txtFile = new File(srcDir, name + ".txt"); //$NON-NLS-1$
|
|
| 147 |
File ecFile = new File(ecDir, name + ".ec"); //$NON-NLS-1$
|
|
| 131 | 148 |
Corpus tmpAnalecCorpus = new Corpus(); // need a temporary corpus |
| 132 |
|
|
| 149 |
|
|
| 133 | 150 |
if (!FichiersGlozz.importerGlozz(tmpAnalecCorpus, acFile, aaFile)) {
|
| 134 | 151 |
System.out.println(Messages.ImportGlozzCorpus_15); |
| 135 | 152 |
return null; |
| 136 | 153 |
} |
| 137 |
|
|
| 154 |
|
|
| 138 | 155 |
final String texte = tmpAnalecCorpus.getTexte(); |
| 139 | 156 |
int debParag = 0; |
| 140 | 157 |
Integer[] finPars = tmpAnalecCorpus.getFinParagraphes(); |
| 141 | 158 |
StringBuffer newTexte = new StringBuffer(texte.length() + finPars.length); |
| 142 |
for (int iParagraph = 0 ; iParagraph < finPars.length ; iParagraph++) {
|
|
| 159 |
for (int iParagraph = 0; iParagraph < finPars.length; iParagraph++) {
|
|
| 143 | 160 |
newTexte.append(texte.substring(debParag, finPars[iParagraph])).append("\n"); //$NON-NLS-1$
|
| 144 | 161 |
debParag = finPars[iParagraph]; |
| 145 | 162 |
} |
| 146 |
|
|
| 163 |
|
|
| 147 | 164 |
IOUtils.write(txtFile, newTexte.toString()); // write the TXT file for TXM TXT import module |
| 148 | 165 |
FichiersJava.enregistrerCorpus(tmpAnalecCorpus, ecFile); // write for later |
| 149 |
|
|
| 166 |
|
|
| 150 | 167 |
if (!txtFile.exists()) {
|
| 151 | 168 |
System.out.println(Messages.ImportGlozzCorpus_17); |
| 152 | 169 |
return null; |
| 153 | 170 |
} |
| 154 | 171 |
} |
| 155 |
|
|
| 156 |
Project project = Toolbox.workspace.getProject(name.toUpperCase());
|
|
| 172 |
|
|
| 173 |
Project project = Toolbox.workspace.getProject(newCorpusName);
|
|
| 157 | 174 |
if (project != null) {
|
| 158 |
// CQPSearchEngine.getEngine().stop();
|
|
| 175 |
// CQPSearchEngine.getEngine().stop();
|
|
| 159 | 176 |
project.delete(); |
| 160 | 177 |
} |
| 161 |
project = new Project(Toolbox.workspace, name.toUpperCase());
|
|
| 162 |
project.setName(name.toUpperCase());
|
|
| 178 |
project = new Project(Toolbox.workspace, newCorpusName);
|
|
| 179 |
project.setName(newCorpusName);
|
|
| 163 | 180 |
project.setSourceDirectory(srcDir.getAbsolutePath()); |
| 164 | 181 |
project.setImportModuleName("txt");
|
| 165 | 182 |
project.getEditionDefinition("default").setBuildEdition(true);
|
| ... | ... | |
| 169 | 186 |
System.out.println(Messages.ImportGlozzCorpus_23); |
| 170 | 187 |
return null; |
| 171 | 188 |
} |
| 172 |
|
|
| 189 |
|
|
| 173 | 190 |
JobHandler job2 = new JobHandler(Messages.ImportGlozzCorpus_24) {
|
| 174 |
|
|
| 191 |
|
|
| 175 | 192 |
@Override |
| 176 | 193 |
protected IStatus run(IProgressMonitor monitor) {
|
| 177 | 194 |
try {
|
| ... | ... | |
| 180 | 197 |
System.out.println("The CQP corpus was not created. Aborting.");
|
| 181 | 198 |
return Status.CANCEL_STATUS; |
| 182 | 199 |
} |
| 200 |
corpus.compute(false); |
|
| 183 | 201 |
Corpus analecCorpus = URSCorpora.getCorpus(corpus); |
| 184 | 202 |
analecCorpus.clearAll(); // remove all : annotations, structure |
| 185 | 203 |
File[] ecFiles = ecDir.listFiles(IOUtils.HIDDENFILE_FILTER); |
| ... | ... | |
| 187 | 205 |
for (File ecFile : ecFiles) {
|
| 188 | 206 |
FichiersJava.concatener(ecFile, analecCorpus); |
| 189 | 207 |
} |
| 190 |
|
|
| 208 |
|
|
| 191 | 209 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_25, aamfile)); |
| 192 | 210 |
if (!FichiersGlozz.importerModeleGlozz(analecCorpus, aamfile)) {
|
| 193 | 211 |
System.out.println(Messages.ImportGlozzCorpus_26); |
| 194 | 212 |
return Status.CANCEL_STATUS; |
| 195 | 213 |
} |
| 196 |
//
|
|
| 197 |
// System.out.println("Importing Glozz corpus from: "+acfile+" and "+aafile);
|
|
| 198 |
// if (!FichiersGlozz.importerGlozz(analecCorpus, acfile, aafile)) {
|
|
| 199 |
// System.out.println("Error while importing Glozz corpus.");
|
|
| 200 |
// return null;
|
|
| 201 |
// }
|
|
| 202 |
|
|
| 214 |
// |
|
| 215 |
// System.out.println("Importing Glozz corpus from: "+acfile+" and "+aafile);
|
|
| 216 |
// if (!FichiersGlozz.importerGlozz(analecCorpus, acfile, aafile)) {
|
|
| 217 |
// System.out.println("Error while importing Glozz corpus.");
|
|
| 218 |
// return null;
|
|
| 219 |
// }
|
|
| 220 |
|
|
| 203 | 221 |
System.out.println(Messages.ImportGlozzCorpus_27); |
| 204 | 222 |
URSCorpora.removeCorpus(corpus); // remove old corpus if any |
| 205 |
|
|
| 223 |
|
|
| 206 | 224 |
URSAnnotationReIndexer aari = new URSAnnotationReIndexer(corpus, analecCorpus); |
| 207 | 225 |
if (!aari.process()) {
|
| 208 | 226 |
System.out.println(Messages.ImportGlozzCorpus_28); |
| 209 | 227 |
return Status.CANCEL_STATUS; |
| 210 | 228 |
} |
| 211 |
|
|
| 229 |
|
|
| 212 | 230 |
System.out.println(Messages.ImportGlozzCorpus_29); |
| 213 | 231 |
URSCorpora.saveCorpus(analecCorpus); |
| 214 | 232 |
URSCorpora.getVue(analecCorpus).retablirVueParDefaut(); |
| ... | ... | |
| 216 | 234 |
analecCorpus.setTexte(""); // free memory //$NON-NLS-1$
|
| 217 | 235 |
DeleteDir.deleteDirectory(ecDir); // cleaning |
| 218 | 236 |
DeleteDir.deleteDirectory(srcDir); // cleaning |
| 219 |
|
|
| 237 |
|
|
| 220 | 238 |
return Status.OK_STATUS; |
| 221 |
} catch (Exception e) {
|
|
| 239 |
} |
|
| 240 |
catch (Exception e) {
|
|
| 222 | 241 |
System.out.println(NLS.bind(Messages.ImportGlozzCorpus_32, e.getLocalizedMessage())); |
| 223 | 242 |
Log.printStackTrace(e); |
| 224 | 243 |
return Status.CANCEL_STATUS; |
| 225 | 244 |
} |
| 226 | 245 |
} |
| 227 | 246 |
}; |
| 228 |
job2.startJob(true); // wait for the TXT import job to finish |
|
| 247 |
job.join(0, null); |
|
| 248 |
if (job.getResult() == Status.OK_STATUS) {
|
|
| 249 |
job2.startJob(true); // wait for the TXT import job to finish |
|
| 250 |
} |
|
| 251 |
else {
|
|
| 252 |
Log.warning("Export could not be finished since the corpus import failed.");
|
|
| 253 |
} |
|
| 229 | 254 |
return null; |
| 230 |
} catch (Exception e) {
|
|
| 255 |
} |
|
| 256 |
catch (Exception e) {
|
|
| 231 | 257 |
// TODO Auto-generated catch block |
| 232 | 258 |
e.printStackTrace(); |
| 233 | 259 |
} |
| tmp/org.txm.analec.rcp/src/org/txm/annotation/urs/commands/ExportGlozzCorpus.java (revision 2905) | ||
|---|---|---|
| 21 | 21 |
import org.txm.searchengine.cqp.serverException.CqiServerError; |
| 22 | 22 |
import org.txm.utils.logger.Log; |
| 23 | 23 |
|
| 24 |
public class ExportGlozzCorpus extends AbstractHandler {
|
|
| 25 |
|
|
| 26 |
@Option(name="exportDirectory",usage="the result directory ", widget="Folder", required=true, def="result directory")
|
|
| 24 |
public class ExportGlozzCorpus extends AbstractHandler {
|
|
| 25 |
|
|
| 26 |
@Option(name = "exportDirectory", usage = "the result directory ", widget = "Folder", required = true, def = "result directory")
|
|
| 27 | 27 |
File exportDirectory; |
| 28 |
@Option(name="unit_type",usage="A unit type to export", widget="String", required=true, def="MENTION") |
|
| 28 |
|
|
| 29 |
@Option(name = "unit_type", usage = "A unit type to export", widget = "String", required = true, def = "MENTION") |
|
| 29 | 30 |
String unit_type; |
| 30 |
|
|
| 31 |
/* (non-Javadoc) |
|
| 31 |
|
|
| 32 |
/* |
|
| 33 |
* (non-Javadoc) |
|
| 32 | 34 |
* @see org.eclipse.core.commands.AbstractHandler#execute(org.eclipse.core.commands.ExecutionEvent) |
| 33 | 35 |
*/ |
| 34 | 36 |
@Override |
| ... | ... | |
| 36 | 38 |
try {
|
| 37 | 39 |
// Open the parameters input dialog box |
| 38 | 40 |
if (!ParametersDialog.open(this)) return null; |
| 39 |
|
|
| 41 |
|
|
| 40 | 42 |
Object first = CorporaView.getFirstSelectedObject(); |
| 41 | 43 |
if (!(first instanceof org.txm.searchengine.cqp.corpus.CQPCorpus)) {
|
| 42 | 44 |
System.out.println(NLS.bind(Messages.ExportGlozzCorpus_0, first)); |
| 43 | 45 |
return null; |
| 44 | 46 |
} |
| 45 |
|
|
| 46 |
final MainCorpus mainCorpus = ((org.txm.searchengine.cqp.corpus.CQPCorpus)first).getMainCorpus(); |
|
| 47 |
|
|
| 47 |
|
|
| 48 |
final MainCorpus mainCorpus = ((org.txm.searchengine.cqp.corpus.CQPCorpus) first).getMainCorpus();
|
|
| 49 |
|
|
| 48 | 50 |
JobHandler job = new JobHandler(NLS.bind(Messages.ExportGlozzCorpus_1, mainCorpus.getName())) {
|
| 49 |
|
|
| 51 |
|
|
| 50 | 52 |
@Override |
| 51 | 53 |
protected IStatus run(IProgressMonitor monitor) {
|
| 52 | 54 |
this.runInit(monitor); |
| 53 | 55 |
try {
|
| 56 |
if (exportDirectory == null) {
|
|
| 57 |
Log.warning("No export directory set. aBorting");
|
|
| 58 |
return Status.CANCEL_STATUS; |
|
| 59 |
} |
|
| 60 |
exportDirectory.mkdirs(); |
|
| 61 |
|
|
| 54 | 62 |
export(exportDirectory, mainCorpus, unit_type); |
| 55 |
} catch (ThreadDeath e) {
|
|
| 63 |
} |
|
| 64 |
catch (ThreadDeath e) {
|
|
| 56 | 65 |
System.out.println(Messages.ExportGlozzCorpus_2); |
| 57 |
} catch (Throwable e) {
|
|
| 66 |
} |
|
| 67 |
catch (Throwable e) {
|
|
| 58 | 68 |
System.out.println(NLS.bind(Messages.ExportGlozzCorpus_3, e.getLocalizedMessage())); |
| 59 | 69 |
Log.printStackTrace(e); |
| 60 | 70 |
return Status.CANCEL_STATUS; |
| ... | ... | |
| 63 | 73 |
} |
| 64 | 74 |
}; |
| 65 | 75 |
job.schedule(); |
| 66 |
|
|
| 76 |
|
|
| 67 | 77 |
return null; |
| 68 |
} catch (Throwable e) {
|
|
| 78 |
} |
|
| 79 |
catch (Throwable e) {
|
|
| 69 | 80 |
System.out.println(NLS.bind(Messages.ExportGlozzCorpus_3, e.getLocalizedMessage())); |
| 70 | 81 |
Log.printStackTrace(e); |
| 71 | 82 |
} |
| tmp/org.txm.analec.rcp/src/org/txm/annotation/urs/URSAnnotationReIndexer.java (revision 2905) | ||
|---|---|---|
| 26 | 26 |
* |
| 27 | 27 |
*/ |
| 28 | 28 |
public class URSAnnotationReIndexer {
|
| 29 |
|
|
| 29 | 30 |
MainCorpus corpus; |
| 31 |
|
|
| 30 | 32 |
Corpus analecCorpus; |
| 33 |
|
|
| 31 | 34 |
File aafile; |
| 35 |
|
|
| 32 | 36 |
public boolean debug = false; |
| 33 |
|
|
| 37 |
|
|
| 34 | 38 |
public URSAnnotationReIndexer(MainCorpus corpus, Corpus analecCorpus) {
|
| 35 | 39 |
this.corpus = corpus; |
| 36 | 40 |
this.analecCorpus = analecCorpus; |
| 37 | 41 |
} |
| 38 |
|
|
| 42 |
|
|
| 39 | 43 |
public boolean process() throws CqiClientException, IOException, CqiServerError {
|
| 40 |
|
|
| 44 |
|
|
| 41 | 45 |
int corpusSize = corpus.getSize(); |
| 42 |
|
|
| 46 |
|
|
| 43 | 47 |
String text = analecCorpus.getTexte(); |
| 44 | 48 |
int isearch = 0; // the current search start position |
| 45 |
|
|
| 49 |
|
|
| 46 | 50 |
AbstractCqiClient CQI = CQPSearchEngine.getCqiClient(); |
| 47 | 51 |
Property word = corpus.getProperty("word");
|
| 48 |
|
|
| 52 |
|
|
| 49 | 53 |
int positions[] = new int[corpusSize]; |
| 50 |
for (int i = 0 ; i < corpusSize ; i++) positions[i] = i; |
|
| 54 |
for (int i = 0; i < corpusSize; i++) |
|
| 55 |
positions[i] = i; |
|
| 51 | 56 |
|
| 52 | 57 |
int positionsCorrespondances[] = new int[corpusSize]; |
| 53 | 58 |
String strs[] = CQI.cpos2Str(word.getQualifiedName(), positions); |
| 54 |
|
|
| 55 |
for (int i = 0 ; i < corpusSize ; i++) {
|
|
| 59 |
|
|
| 60 |
for (int i = 0; i < corpusSize; i++) {
|
|
| 56 | 61 |
int idx = text.indexOf(strs[i], isearch); |
| 57 | 62 |
if (idx < 0) {
|
| 58 |
System.out.println("Error: cannot find word='"+strs[i]+"' (word with position in CQP corpus="+positions[i]+") in text with current carret="+isearch+". Aborting.");
|
|
| 63 |
System.out.println("Error: cannot find word='" + strs[i] + "' (word with position in CQP corpus=" + positions[i] + ") in text with current carret=" + isearch + ". Aborting.");
|
|
| 59 | 64 |
|
| 60 | 65 |
System.out.println("Current text slice is (-20, +20 characters): ");
|
| 61 |
System.out.println("* before: "+text.substring(Math.max(0, isearch-20), Math.min(isearch+20, isearch)));
|
|
| 62 |
System.out.println("* after: "+text.substring(isearch, Math.min(isearch+20, text.length()-1)));
|
|
| 66 |
System.out.println("* before: " + text.substring(Math.max(0, isearch - 20), Math.min(isearch + 20, isearch)));
|
|
| 67 |
System.out.println("* after: " + text.substring(isearch, Math.min(isearch + 20, text.length() - 1)));
|
|
| 63 | 68 |
|
| 64 | 69 |
return false; |
| 65 | 70 |
} |
| ... | ... | |
| 69 | 74 |
|
| 70 | 75 |
if (debug) System.out.println(Arrays.toString(positions)); |
| 71 | 76 |
if (debug) System.out.println(Arrays.toString(positionsCorrespondances)); |
| 72 |
|
|
| 77 |
|
|
| 73 | 78 |
isearch = 0; |
| 74 | 79 |
ArrayList<Unite> unites = analecCorpus.getToutesUnites(); |
| 75 | 80 |
if (debug) System.out.println("units not-sorted: ");
|
| 76 | 81 |
if (debug) printUnits(unites); |
| 77 | 82 |
Collections.sort(unites, new Comparator<Unite>() {
|
| 83 |
|
|
| 78 | 84 |
@Override |
| 79 | 85 |
public int compare(Unite o1, Unite o2) {
|
| 80 |
return o1.getDeb() - o2.getDeb(); |
|
| 86 |
int d = o1.getDeb() - o2.getDeb(); |
|
| 87 |
if (d == 0) return o1.getFin() - o2.getFin(); |
|
| 88 |
else return d; |
|
| 81 | 89 |
} |
| 82 | 90 |
}); |
| 83 | 91 |
if (debug) System.out.println("units sorted: ");
|
| ... | ... | |
| 93 | 101 |
boolean startFound = false; |
| 94 | 102 |
boolean endFound = false; |
| 95 | 103 |
int i = 0; // or not : unites are sorted by start position, we don't need to browse all words \o/ |
| 96 |
for (; i < positionsCorrespondances.length ; i++) {
|
|
| 104 |
for (; i < positionsCorrespondances.length; i++) {
|
|
| 97 | 105 |
if (startFound && endFound) break; // no need to go further |
| 98 |
if (debug) System.out.println("i="+i+" positionsCorrespondances[i]="+positionsCorrespondances[i]);
|
|
| 106 |
if (debug) System.out.println("i=" + i + " positionsCorrespondances[i]=" + positionsCorrespondances[i]);
|
|
| 99 | 107 |
if (!startFound && start < positionsCorrespondances[i]) {
|
| 100 |
unite.setDeb(i-1);
|
|
| 108 |
unite.setDeb(i - 1);
|
|
| 101 | 109 |
startFound = true; |
| 102 | 110 |
} |
| 103 | 111 |
if (!endFound && end <= positionsCorrespondances[i]) {
|
| 104 |
unite.setFin(i-1);
|
|
| 112 |
unite.setFin(i - 1);
|
|
| 105 | 113 |
endFound = true; |
| 106 | 114 |
} |
| 107 | 115 |
} |
| 108 | 116 |
|
| 109 | 117 |
if (!endFound && i == positionsCorrespondances.length) {
|
| 110 |
unite.setFin(i-1);
|
|
| 118 |
unite.setFin(i - 1);
|
|
| 111 | 119 |
endFound = true; |
| 112 | 120 |
} |
| 113 | 121 |
if (!startFound && i == positionsCorrespondances.length) {
|
| 114 |
unite.setDeb(i-1);
|
|
| 122 |
unite.setDeb(i - 1);
|
|
| 115 | 123 |
startFound = true; |
| 116 | 124 |
} |
| 117 | 125 |
|
| 118 | 126 |
if (!(startFound && endFound)) {
|
| 119 |
String s = "Error: cannot find words positions for unite of type="+unite.getType()+" and unit carret positions=["+start+", "+end+"]. Aborting";
|
|
| 127 |
String s = "Error: cannot find words positions for unite of type=" + unite.getType() + " and unit carret positions=[" + start + ", " + end + "]. Aborting";
|
|
| 120 | 128 |
System.out.println(s); |
| 121 | 129 |
|
| 122 | 130 |
System.out.println("5 last found units are: ");
|
| 123 |
for (int j = 4 ; j >= 0 ; j--) {
|
|
| 124 |
if (iunite-j >= 0) {
|
|
| 125 |
printUnite(unites.get(iunite-j));
|
|
| 131 |
for (int j = 4; j >= 0; j--) {
|
|
| 132 |
if (iunite - j >= 0) {
|
|
| 133 |
printUnite(unites.get(iunite - j));
|
|
| 126 | 134 |
} |
| 127 | 135 |
} |
| 128 | 136 |
return false; |
| 129 | 137 |
} |
| 130 |
//if (i > 0) i--; // restart at previous word |
|
| 138 |
// if (i > 0) i--; // restart at previous word
|
|
| 131 | 139 |
iunite++; |
| 132 | 140 |
} |
| 133 | 141 |
if (debug) System.out.println("units updated: ");
|
| ... | ... | |
| 136 | 144 |
} |
| 137 | 145 |
|
| 138 | 146 |
public static void printUnite(Unite unite) {
|
| 139 |
System.out.print(unite.getType()+ "["+unite.getDeb()+", "+unite.getFin()+"]");
|
|
| 147 |
System.out.print(unite.getType() + "[" + unite.getDeb() + ", " + unite.getFin() + "]");
|
|
| 140 | 148 |
} |
| 141 | 149 |
|
| 142 | 150 |
public static void printUnits(List<Unite> units) {
|
Formats disponibles : Unified diff