You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

415 lines
12 KiB

package de.memtext.fileConverting;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.function.BooleanSupplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.mozilla.universalchardet.UniversalDetector;
import com.google.common.base.Utf8;
import java.util.*;
/**
 * Command-line tool that scans text files for unwanted control characters and,
 * unless running in dry-run mode, rewrites them in the configured output encoding
 * with those characters removed.
 *
 * <p>Files are read with the configured input encoding (default ISO-8859-15) and
 * written with the configured output encoding (default UTF-8). Problems found are
 * collected per file and reported at the end of {@code run()}.
 */
public class FileChecker {
    /** Can be switched off for JUnit tests so that error paths do not terminate the JVM. */
    public static boolean isSystemExitWanted = true;

    /**
     * Matches any character of Unicode general category C ("Other", which includes
     * control characters) except line feed, carriage return and tab.
     */
    private static final String CONTROLREGEX = "[[^\\n\\r\\t]&&\\p{C}]";
    private static final Pattern CONTROLPATTERN = Pattern.compile(CONTROLREGEX);

    private static final Options options = new Options();

    /** Every message passed to the log methods, regardless of log level; queried via {@link #containsLog}. */
    private static final List<String> internalLog = new LinkedList<String>();

    private enum LogLevel {
        DEBUG, INFO, SEVERE
    }

    private boolean isDryRun = false;
    private boolean isSafeRun = false;
    private boolean isReplaceTabsWanted = false;
    private String inputEncoding = "ISO-8859-15";
    private String outputEncoding = "UTF-8";
    private boolean isOutputEncodingUTF8 = true;
    private boolean hasErrors = false;
    private final Set<File> filesToCheck = new HashSet<File>();
    private final Map<File, List<String>> filesWithErrors = new HashMap<File, List<String>>();
    private LogLevel loglevel = LogLevel.INFO;

    public FileChecker() {
    }

    /**
     * Entry point. Parses the command line, configures a {@link FileChecker} and runs it.
     * Without arguments only the usage text is printed.
     *
     * @param args command-line arguments, see {@code initOptions()} for the supported options
     */
    public static void main(String[] args) {
        initOptions();
        if (args.length == 0) {
            showUsage();
            return;
        }
        CommandLineParser parser = new DefaultParser();
        FileChecker fc = new FileChecker();
        try {
            CommandLine cmd = parser.parse(options, args);
            initLogLevel(fc, cmd);
            if (cmd.hasOption("ie")) {
                fc.setInputEncoding(cmd.getOptionValue("ie"));
            }
            if (cmd.hasOption("oe")) {
                fc.setOutputEncoding(cmd.getOptionValue("oe"));
            }
            if (cmd.hasOption("n")) {
                fc.setDryRun(true);
                fc.logdebug("Dry Mode");
            }
            if (cmd.hasOption("s")) {
                fc.setSafeRun(true);
                fc.logdebug("Safe Mode");
            }
            if (cmd.hasOption("t")) {
                fc.setReplaceTabsWanted(true);
            }
            if (!cmd.hasOption("f") && !cmd.hasOption("d")) {
                System.out.println("Geben Sie als Option -f oder -d an");
                showUsage();
                exitIfWanted();
            }
            if (cmd.hasOption("f")) {
                fc.addTestfile(cmd.getOptionValue("f"));
            }
            if (cmd.hasOption("d")) {
                String wildcard = cmd.hasOption("w") ? cmd.getOptionValue("w") : "*.unl";
                String verzeichnis = cmd.getOptionValue("d");
                if (new File(verzeichnis).isDirectory()) {
                    SearchFileByWildcard sfbw = new SearchFileByWildcard();
                    fc.addSearchFiles(sfbw.searchWithWc(Paths.get(verzeichnis), "glob:" + wildcard));
                } else {
                    System.out.println("Kein gültiges Verzeichnis: " + verzeichnis);
                    exitIfWanted();
                }
            }
            fc.run();
        } catch (ParseException | IOException e) {
            e.printStackTrace();
            exitIfWanted();
        }
    }

    /**
     * Returns whether the given message has been logged since the last {@code run()} started
     * (the internal log is cleared at the beginning of each run).
     *
     * @param test the exact message to look for
     * @return {@code true} if the internal log contains the message
     */
    public static boolean containsLog(String test) {
        return internalLog.contains(test);
    }

    public String getInputEncoding() {
        return inputEncoding;
    }

    /**
     * Sets the encoding used for reading files. An unknown charset name is reported on
     * stdout and terminates the JVM if {@link #isSystemExitWanted} is set.
     *
     * @param inputEncoding charset name
     */
    public void setInputEncoding(String inputEncoding) {
        if (!isKnownCharset(inputEncoding)) {
            System.out.println("Ungültiges inputEncoding: " + inputEncoding);
            exitIfWanted();
        }
        this.inputEncoding = inputEncoding;
    }

    public boolean isSafeRun() {
        return isSafeRun;
    }

    public void setSafeRun(boolean isSafeRun) {
        this.isSafeRun = isSafeRun;
    }

    public boolean isDryRun() {
        return isDryRun;
    }

    public void setDryRun(boolean isDryRun) {
        this.isDryRun = isDryRun;
    }

    /** Terminates the JVM with exit code 1 unless exiting has been disabled (e.g. for tests). */
    private static void exitIfWanted() {
        if (isSystemExitWanted) {
            System.exit(1);
        }
    }

    /** Registers all supported command-line options. */
    private static void initOptions() {
        options.addOption("ie", "input-encoding", true, "Encoding der Eingabedatei/en (default ISO-8859-15)");
        // NOTE(review): the long option name is misspelled ("ouput"). It is kept unchanged
        // because renaming it would break existing command lines; "-oe" is unaffected.
        options.addOption("oe", "ouput-encoding", true, "ZielEncoding (default UTF8)");
        options.addOption("l", "log", true, "debug|info (default)|severe");
        options.addOption("n", "dry-run", false, "Ausführung ohne Änderungen");
        options.addOption("s", "safe-run", false, "Bei Fehlern Originaldateien als .orig behalten");
        options.addOption("f", "file", true, "Datei");
        options.addOption("d", "dir", true, "Verzeichnis");
        options.addOption("w", "wildcard", true, "Wildcard für Dateisuche in einem Verzeichnis (Default *.unl)");
        options.addOption("t", "tabs-to-spaces", false, "Tabs werden durch Leerzeichen ersetzt (Default false)");
    }

    /** Prints the usage text for all registered options. */
    private static void showUsage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("FileChecker", options);
    }

    /**
     * Applies the {@code -l} option, if present. The value is matched case-insensitively;
     * an unknown value is reported and terminates the JVM if exiting is enabled.
     */
    private static void initLogLevel(FileChecker fc, CommandLine cmd) {
        if (!cmd.hasOption("l")) {
            return;
        }
        String val = cmd.getOptionValue("l");
        switch (val.toLowerCase(Locale.ROOT)) {
            case "debug":
                fc.setLoglevel(LogLevel.DEBUG);
                break;
            case "info":
                fc.setLoglevel(LogLevel.INFO);
                break;
            case "severe":
                fc.setLoglevel(LogLevel.SEVERE);
                break;
            default:
                System.out.println("ungültiger Loglevel: " + val + " erlaubt: debug info oder severe");
                exitIfWanted();
        }
    }

    /** Returns whether {@code name} is a charset name this JVM can resolve. */
    private static boolean isKnownCharset(String name) {
        try {
            Charset.forName(name);
            return true;
        } catch (IllegalArgumentException e) {
            // covers illegal names, unsupported charsets and null
            return false;
        }
    }

    private void setReplaceTabsWanted(boolean replaceTabs) {
        this.isReplaceTabsWanted = replaceTabs;
    }

    private void addSearchFiles(Set<File> fileset) {
        filesToCheck.addAll(fileset);
    }

    private String getLoglevel() {
        return loglevel.toString();
    }

    private void setLoglevel(LogLevel loglevel) {
        this.loglevel = loglevel;
    }

    /**
     * Sets the encoding used for writing files and records whether it is UTF-8, which
     * enables the additional UTF-8 well-formedness check in {@code check()}.
     */
    private void setOutputEncoding(String outputEncoding) {
        this.outputEncoding = outputEncoding;
        if (!isKnownCharset(outputEncoding)) {
            System.out.println("Ungültiges outputEncoding: " + outputEncoding);
            exitIfWanted();
        }
        isOutputEncodingUTF8 = "UTF-8".equalsIgnoreCase(outputEncoding)
                || "UTF8".equalsIgnoreCase(outputEncoding);
    }

    private String getOutputEncoding() {
        return this.outputEncoding;
    }

    /**
     * Adds a single file to the set of files to check.
     *
     * @throws IOException if the file does not exist
     */
    private void addTestfile(String filename) throws IOException {
        File file = new File(filename);
        if (!file.exists()) {
            throw new IOException("Datei " + filename + " existiert nicht");
        }
        this.filesToCheck.add(file);
    }

    /**
     * Checks all registered files and afterwards reports every recorded problem at
     * info level. If any problem was recorded, the JVM is terminated with exit code 1
     * (when exiting is enabled).
     */
    private void run() throws IOException {
        logstatus();
        for (File singleFile : filesToCheck) {
            check(singleFile);
        }
        if (!hasErrors) {
            return;
        }
        loginfo("Fehler in folgenden Dateien:");
        for (Map.Entry<File, List<String>> entry : filesWithErrors.entrySet()) {
            String path = entry.getKey().getAbsolutePath();
            for (String meldung : entry.getValue()) {
                loginfo(path + " " + meldung);
            }
        }
        exitIfWanted();
    }

    /** Resets the internal log and records the current run configuration in it. */
    private void logstatus() {
        internalLog.clear();
        internalLog.add("dry run is " + (isDryRun ? "on" : "off"));
        internalLog.add("safe run is " + (isSafeRun ? "on" : "off"));
        internalLog.add("LogLevel is " + getLoglevel());
    }

    /**
     * Checks one file line by line.
     *
     * <p>In dry-run mode the file is only read. Otherwise it is first renamed to
     * {@code <name>.orig}, then a cleaned copy (control characters removed, optionally
     * tabs replaced by spaces, lines terminated with {@code \n}) is written under the
     * original name in the output encoding. The {@code .orig} file is deleted afterwards
     * unless safe-run mode is active.
     *
     * <p>At debug level the position of every control character is written to
     * {@code <name>_problems.txt}; that file is removed again if nothing was found.
     *
     * @throws IOException if renaming, reading or writing fails
     */
    private void check(File fileInList) throws IOException {
        logdebug("verarbeite Datei " + fileInList);
        final boolean debug = loglevel == LogLevel.DEBUG;
        final File problemsFile = new File(fileInList.getAbsolutePath() + "_problems.txt");
        boolean hasControlChar = false;
        boolean isInvalidUtf = false;
        File originalFile = null;

        // try-with-resources skips null resources, so optional writers can be declared inline
        try (BufferedWriter logwriter = debug ? new BufferedWriter(new FileWriter(problemsFile)) : null) {
            logFileEncoding(fileInList);

            File fileToCheck = fileInList;
            if (!isDryRun) {
                originalFile = new File(fileInList.getAbsolutePath() + ".orig");
                if (!fileInList.renameTo(originalFile)) {
                    throw new IOException("Konnte " + fileInList + " nicht umbennen nach " + originalFile);
                }
                fileToCheck = originalFile;
            }

            try (BufferedWriter outwriter = isDryRun ? null
                        : new BufferedWriter(new OutputStreamWriter(
                                new FileOutputStream(fileInList.getAbsolutePath()), outputEncoding));
                    Scanner scanner = new Scanner(fileToCheck, Charset.forName(inputEncoding))) {
                int lineNo = 0;
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    lineNo++;

                    Matcher matcher = CONTROLPATTERN.matcher(line);
                    boolean lineHasControlChar = false;
                    while (matcher.find()) {
                        lineHasControlChar = true;
                        if (logwriter != null) {
                            logwriter.write("Fehler in Zeile " + lineNo + " Position " + (matcher.start() + 1)
                                    + " Kontrollzeichen" + "\n");
                        }
                    }
                    hasControlChar |= lineHasControlChar;

                    // Encode explicitly as UTF-8 so the result does not depend on the platform charset.
                    if (isOutputEncodingUTF8
                            && !Utf8.isWellFormed(line.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
                        isInvalidUtf = true;
                        noteError(logwriter, fileInList, "Fehler in Zeile " + lineNo + " ungültig in UTF8");
                    }

                    if (outwriter != null) {
                        if (lineHasControlChar) {
                            // Matcher.replaceAll resets the matcher before replacing
                            line = matcher.replaceAll("");
                        }
                        if (isReplaceTabsWanted) {
                            line = line.replace('\t', ' ');
                        }
                        outwriter.write(line + "\n");
                    }
                }
                // Scanner swallows read errors; surface them so a truncated copy is never
                // mistaken for a complete one (the .orig file would be deleted below).
                IOException readError = scanner.ioException();
                if (readError != null) {
                    throw readError;
                }
            }

            if (hasControlChar) {
                noteError(logwriter, fileInList,
                        "Datei enthält nach Einlesen mit " + getInputEncoding() + " Kontrollzeichen");
            }
        }

        if (!isDryRun && !isSafeRun && originalFile != null && originalFile.exists()) {
            originalFile.delete();
        }
        if (debug && !hasControlChar && !isInvalidUtf && problemsFile.exists()) {
            problemsFile.delete();
        }
    }

    /**
     * Runs automatic charset detection on the file and logs the result at debug level.
     * The detected charset is not compared with the configured input encoding; it is
     * only logged.
     *
     * @throws IOException if the file cannot be read
     */
    private void logFileEncoding(File file) throws IOException {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buf = new byte[4096];
        try (java.io.FileInputStream fis = new java.io.FileInputStream(file)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding == null) {
            logdebug("Es konnte kein Encoding automatisch ermittelt werden für " + file + " gehe aus von "
                    + getInputEncoding());
        } else {
            logdebug("Automatisch ermitteltes Encoding von " + file + " : " + encoding);
        }
        detector.reset();
    }

    /** Records the message in the internal log and prints it at debug level. */
    private void logdebug(String message) {
        internalLog.add(message);
        if (loglevel == LogLevel.DEBUG) {
            System.out.println(message);
        }
    }

    /** Records the message in the internal log and prints it at debug or info level. */
    private void loginfo(String message) {
        internalLog.add(message);
        if (loglevel == LogLevel.DEBUG || loglevel == LogLevel.INFO) {
            System.out.println(message);
        }
    }

    /**
     * Marks the run as failed and stores the message for the given file so that
     * {@code run()} can report it.
     *
     * @param writer  currently unused. NOTE(review): callers pass the problems-file writer;
     *                presumably the message was meant to be written there too - confirm.
     * @param file    file the problem belongs to
     * @param meldung problem description
     */
    private void noteError(BufferedWriter writer, File file, String meldung) throws IOException {
        hasErrors = true;
        filesWithErrors.computeIfAbsent(file, key -> new ArrayList<String>()).add(meldung);
    }
}