You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
415 lines
12 KiB
415 lines
12 KiB
package de.memtext.fileConverting; |
|
|
|
import java.io.BufferedWriter; |
|
import java.io.File; |
|
import java.io.FileNotFoundException; |
|
import java.io.FileOutputStream; |
|
import java.io.FileWriter; |
|
import java.io.IOException; |
|
import java.io.OutputStreamWriter; |
|
import java.nio.charset.Charset; |
|
import java.nio.file.Files; |
|
import java.nio.file.Paths; |
|
import java.nio.file.StandardCopyOption; |
|
import java.util.HashSet; |
|
import java.util.LinkedList; |
|
import java.util.List; |
|
import java.util.Scanner; |
|
import java.util.Set; |
|
import java.util.function.BooleanSupplier; |
|
import java.util.regex.Matcher; |
|
import java.util.regex.Pattern; |
|
|
|
import org.apache.commons.cli.CommandLine; |
|
import org.apache.commons.cli.CommandLineParser; |
|
import org.apache.commons.cli.DefaultParser; |
|
import org.apache.commons.cli.HelpFormatter; |
|
import org.apache.commons.cli.Options; |
|
import org.apache.commons.cli.ParseException; |
|
import org.mozilla.universalchardet.UniversalDetector; |
|
|
|
import com.google.common.base.Utf8; |
|
import java.util.*; |
|
|
|
public class FileChecker { |
|
// kann für JUnitTest abgeschaltet werden |
|
public static boolean isSystemExitWanted = true; |
|
private static final String CONTROLREGEX = "[^\\n\\r\\t]]&&\\p{C}]"; |
|
// private static final Pattern CONTROLPATTERN = Pattern.compile(CONTROLREGEX); |
|
private static Pattern CONTROLPATTERN = Pattern.compile("[[^\\n\\r\\t]&&\\p{C}]"); |
|
private static final Options options = new Options(); |
|
|
|
private boolean isDryRun = false; |
|
private boolean isSafeRun = false; |
|
private boolean isReplaceTabsWanted = false; |
|
private String inputEncoding = "ISO-8859-15"; |
|
private String outputEncoding = "UTF-8"; |
|
private boolean isOutputEncodingUTF8 = true; |
|
private boolean hasErrors = false; |
|
private Set<File> filesToCheck = new HashSet<File>(); |
|
private Map<File, ArrayList<String>> filesWithErrors = new HashMap<File, ArrayList<String>>(); |
|
|
|
private static List<String> internalLog = new LinkedList<String>(); |
|
|
|
private enum loglevels { |
|
DEBUG, INFO, SEVERE |
|
} |
|
|
|
private Object loglevel = loglevels.INFO; |
|
|
|
public String getInputEncoding() { |
|
return inputEncoding; |
|
} |
|
|
|
private static void exitIfWanted() { |
|
if (isSystemExitWanted) { |
|
System.exit(1); |
|
} |
|
} |
|
|
|
public void setInputEncoding(String inputEncoding) { |
|
try { |
|
Charset.forName(inputEncoding); |
|
} catch (Exception e) { |
|
System.out.println("Ungültiges inputEncoding: " + inputEncoding); |
|
exitIfWanted(); |
|
} |
|
this.inputEncoding = inputEncoding; |
|
} |
|
|
|
private static void initOptions() { |
|
options.addOption("ie", "input-encoding", true, "Encoding der Eingabedatei/en (default ISO-8859-15)"); |
|
options.addOption("oe", "ouput-encoding", true, "ZielEncoding (default UTF8)"); |
|
options.addOption("l", "log", true, "debug|info (default)|severe"); |
|
options.addOption("n", "dry-run", false, "Ausführung ohne Änderungen"); |
|
options.addOption("s", "safe-run", false, "Bei Fehlern Originaldateien als .orig behalten"); |
|
options.addOption("f", "file", true, "Datei"); |
|
options.addOption("d", "dir", true, "Verzeichnis"); |
|
options.addOption("w", "wildcard", true, "Wildcard für Dateisuche in einem Verzeichnis (Default *.unl)"); |
|
options.addOption("t", "tabs-to-spaces", false, "Tabs werden durch Leerzeichen ersetzt (Default false)"); |
|
|
|
} |
|
|
|
private static void showUsage() { |
|
HelpFormatter formatter = new HelpFormatter(); |
|
formatter.printHelp("FileChecker", options); |
|
} |
|
|
|
public boolean isSafeRun() { |
|
return isSafeRun; |
|
} |
|
|
|
public void setSafeRun(boolean isSafeRun) { |
|
this.isSafeRun = isSafeRun; |
|
} |
|
|
|
public boolean isDryRun() { |
|
return isDryRun; |
|
} |
|
|
|
public void setDryRun(boolean isDryRun) { |
|
this.isDryRun = isDryRun; |
|
} |
|
|
|
public FileChecker() { |
|
} |
|
|
|
public static void main(String[] args) { |
|
initOptions(); |
|
if (args.length == 0) { |
|
showUsage(); |
|
} else { |
|
CommandLineParser parser = new DefaultParser(); |
|
CommandLine cmd = null; |
|
FileChecker fc = new FileChecker(); |
|
try { |
|
cmd = parser.parse(options, args); |
|
initLogLevel(fc, cmd); |
|
if (cmd.hasOption("ie")) { |
|
fc.setInputEncoding(cmd.getOptionValue("ie")); |
|
} |
|
if (cmd.hasOption("oe")) { |
|
fc.setOutputEncoding(cmd.getOptionValue("oe")); |
|
} |
|
if (cmd.hasOption("n")) { |
|
fc.setDryRun(true); |
|
fc.logdebug("Dry Mode"); |
|
} |
|
if (cmd.hasOption("s")) { |
|
fc.setSafeRun(true); |
|
fc.logdebug("Safe Mode"); |
|
} |
|
if (cmd.hasOption("t")) { |
|
fc.setReplaceTabsWanted(true); |
|
} |
|
if (!cmd.hasOption("f") && !cmd.hasOption("d")) { |
|
System.out.println("Geben Sie als Option -f oder -d an"); |
|
showUsage(); |
|
exitIfWanted(); |
|
} |
|
if (cmd.hasOption("f")) { |
|
fc.addTestfile(cmd.getOptionValue("f")); |
|
} |
|
if (cmd.hasOption("d")) { |
|
String wildcard = "*.unl"; |
|
if (cmd.hasOption("w")) { |
|
wildcard = cmd.getOptionValue("w"); |
|
} |
|
String verzeichnis = cmd.getOptionValue("d"); |
|
File dir = new File(verzeichnis); |
|
if (dir.isDirectory()) { |
|
SearchFileByWildcard sfbw = new SearchFileByWildcard(); |
|
fc.addSearchFiles(sfbw.searchWithWc(Paths.get(verzeichnis), "glob:" + wildcard)); |
|
|
|
} else { |
|
System.out.println("Kein gültiges Verzeichnis: " + verzeichnis); |
|
exitIfWanted(); |
|
} |
|
} |
|
fc.run(); |
|
} catch (ParseException e) { |
|
e.printStackTrace(); |
|
exitIfWanted(); |
|
} catch (IOException e) { |
|
e.printStackTrace(); |
|
exitIfWanted(); |
|
} |
|
} |
|
} |
|
|
|
private void setReplaceTabsWanted(boolean replaceTabs) { |
|
this.isReplaceTabsWanted = replaceTabs; |
|
|
|
} |
|
|
|
private void addSearchFiles(Set<File> fileset) { |
|
filesToCheck.addAll(fileset); |
|
|
|
} |
|
|
|
private void run() throws IOException { |
|
logstatus(); |
|
for (File singleFile : filesToCheck) { |
|
check(singleFile); |
|
} |
|
if (hasErrors) { |
|
loginfo("Fehler in folgenden Dateien:"); |
|
for (File f : filesWithErrors.keySet()) { |
|
ArrayList<String> errorlist = filesWithErrors.get(f); |
|
for (String meldung : errorlist) { |
|
loginfo(f.getAbsolutePath() + " " + meldung); |
|
} |
|
} |
|
exitIfWanted(); |
|
} |
|
} |
|
|
|
private void logstatus() { |
|
internalLog.clear(); |
|
internalLog.add("dry run is " + (isDryRun ? "on" : "off")); |
|
internalLog.add("safe run is " + (isSafeRun ? "on" : "off")); |
|
internalLog.add("LogLevel is " + getLoglevel()); |
|
|
|
} |
|
|
|
private static void initLogLevel(FileChecker fc, CommandLine cmd) { |
|
if (cmd.hasOption("l")) { |
|
String val = cmd.getOptionValue("l"); |
|
if (!val.equals("debug") && !val.equals("info") && !val.equals("severe")) { |
|
System.out.println("ungültiger Loglevel: " + val + " erlaubt: debug info oder severe"); |
|
exitIfWanted(); |
|
} |
|
if (cmd.getOptionValue("l").equalsIgnoreCase("debug")) { |
|
fc.setLoglevel(loglevels.DEBUG); |
|
} |
|
if (cmd.getOptionValue("l").equalsIgnoreCase("info")) { |
|
fc.setLoglevel(loglevels.INFO); |
|
} |
|
if (cmd.getOptionValue("l").equalsIgnoreCase("severe")) { |
|
fc.setLoglevel(loglevels.SEVERE); |
|
} |
|
} |
|
|
|
} |
|
|
|
private String getLoglevel() { |
|
return loglevel.toString(); |
|
} |
|
|
|
private void setLoglevel(Object loglevel) { |
|
this.loglevel = loglevel; |
|
|
|
} |
|
|
|
private void setOutputEncoding(String outputEncoding) { |
|
this.outputEncoding = outputEncoding; |
|
try { |
|
Charset.forName(outputEncoding); |
|
} catch (Exception e) { |
|
System.out.println("Ungültiges outputEncoding: " + outputEncoding); |
|
exitIfWanted(); |
|
} |
|
if (outputEncoding.equalsIgnoreCase("UTF-8")) { |
|
isOutputEncodingUTF8 = false; |
|
} |
|
|
|
} |
|
|
|
private String getOutputEncoding() { |
|
return this.outputEncoding; |
|
|
|
} |
|
|
|
private void addTestfile(String filename) throws IOException { |
|
File file = new File(filename); |
|
if (!file.exists()) |
|
throw new IOException("Datei " + filename + " existiert nicht"); |
|
this.filesToCheck.add(file); |
|
|
|
} |
|
|
|
private void check(File fileInList) throws IOException { |
|
logdebug("verarbeite Datei " + fileInList); |
|
BufferedWriter logwriter = null; |
|
if (loglevel == loglevels.DEBUG) { |
|
logwriter = new BufferedWriter(new FileWriter(fileInList.getAbsolutePath() + "_problems.txt")); |
|
} |
|
logFileEncoding(fileInList); |
|
|
|
BufferedWriter outwriter = null; |
|
File fileToCheck = null; |
|
File originalFile = null; |
|
if (isDryRun) { |
|
fileToCheck = fileInList; |
|
} else { |
|
originalFile = new File(fileInList.getAbsolutePath() + ".orig"); |
|
boolean success = fileInList.renameTo(originalFile); |
|
|
|
if (!success) { |
|
throw new IOException("Konnte " + fileInList + " nicht umbennen nach " + originalFile); |
|
} |
|
|
|
fileToCheck = originalFile; |
|
outwriter = new BufferedWriter( |
|
new OutputStreamWriter(new FileOutputStream(fileInList.getAbsolutePath()), outputEncoding)); |
|
} |
|
|
|
Scanner scanner = new Scanner(fileToCheck, Charset.forName(inputEncoding)); |
|
int i = 0; |
|
boolean hasControlChar = false; |
|
boolean isInvalidUtf = false; |
|
while (scanner.hasNextLine()) { |
|
String line = scanner.nextLine(); |
|
i++; |
|
Matcher matcher = CONTROLPATTERN.matcher(line); |
|
|
|
while (matcher.find()) { |
|
hasControlChar = true; |
|
if (loglevel == loglevels.DEBUG) { |
|
logwriter.write( |
|
"Fehler in Zeile " + i + " Position " + (matcher.start() + 1) + " Kontrollzeichen" + "\n"); |
|
} |
|
|
|
} |
|
if (isOutputEncodingUTF8 && !Utf8.isWellFormed(line.getBytes())) { |
|
isInvalidUtf = true; |
|
noteError(logwriter, fileInList, "Fehler in Zeile " + i + " ungültig in UTF8"); |
|
} |
|
if (outwriter != null) { |
|
if (hasControlChar) { |
|
line = line.replaceAll(CONTROLREGEX, ""); |
|
} |
|
if (isReplaceTabsWanted) { |
|
line = line.replace('\t', ' '); |
|
} |
|
outwriter.write(line + "\n"); |
|
} |
|
} |
|
// System.out.println("Anzahl kontrollierter Zeilen " + i); |
|
scanner.close(); |
|
if (outwriter != null) { |
|
outwriter.close(); |
|
} |
|
if (hasControlChar) { |
|
noteError(logwriter, fileInList, "Datei enthält nach Einlesen mit "+getInputEncoding()+" Kontrollzeichen"); |
|
} |
|
if (!isDryRun && !isSafeRun && originalFile != null && originalFile.exists()) { |
|
originalFile.delete(); |
|
} |
|
if (loglevel == loglevels.DEBUG) { |
|
logwriter.close(); |
|
if (!hasControlChar && !isInvalidUtf) { |
|
File logfile = new File(fileInList.getAbsolutePath() + "_problems.txt"); |
|
if (logfile.exists()) |
|
logfile.delete(); |
|
} |
|
} |
|
|
|
} |
|
|
|
private void logFileEncoding(File file) throws IOException { |
|
boolean result = true; |
|
byte[] buf = new byte[4096]; |
|
java.io.FileInputStream fis = new java.io.FileInputStream(file); |
|
UniversalDetector detector = new UniversalDetector(null); |
|
int nread; |
|
while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { |
|
detector.handleData(buf, 0, nread); |
|
} |
|
detector.dataEnd(); |
|
fis.close(); |
|
String encoding = detector.getDetectedCharset(); |
|
if (encoding == null) { |
|
logdebug("Es konnte kein Encoding automatisch ermittelt werden für " + file + " gehe aus von " |
|
+ getInputEncoding()); |
|
|
|
} else { |
|
logdebug("Automatisch ermitteltes Encoding von " + file + " : " + encoding); |
|
|
|
/* |
|
* if (encoding.equals("UTF-8") && !getInputEncoding().equals("UTF-8") || |
|
* (getInputEncoding().equals("ISO-8859-1") || |
|
* getInputEncoding().equals("ISO-8859-15")) && |
|
* !encoding.startsWith("ISO-8859")) { result = false; noteError(null, file, |
|
* "angegebenes InputEncoding: " + getInputEncoding() + |
|
* " passt nicht zu automatisch ermitteltem: " + encoding); } |
|
*/ |
|
|
|
} |
|
|
|
detector.reset(); |
|
// return result; |
|
} |
|
|
|
private void logdebug(String message) { |
|
internalLog.add(message); |
|
if (loglevel == loglevels.DEBUG) { |
|
System.out.println(message); |
|
|
|
} |
|
} |
|
|
|
private void loginfo(String message) { |
|
internalLog.add(message); |
|
if (loglevel == loglevels.DEBUG || loglevel == loglevels.INFO) { |
|
System.out.println(message); |
|
|
|
} |
|
} |
|
|
|
private void noteError(BufferedWriter writer, File file, String meldung) throws IOException { |
|
hasErrors = true; |
|
if (!filesWithErrors.containsKey(file)) { |
|
filesWithErrors.put(file, new ArrayList<String>()); |
|
} |
|
ArrayList<String> errorlist = filesWithErrors.get(file); |
|
errorlist.add(meldung); |
|
|
|
} |
|
|
|
public static boolean containsLog(String test) { |
|
|
|
return internalLog.contains(test); |
|
} |
|
|
|
}
|
|
|