diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index 00725f4685a..93cdd0b9edc 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -23,8 +23,10 @@ */ package org.opengrok.indexer.analysis; +import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; @@ -47,6 +49,7 @@ import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; + import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -67,6 +70,7 @@ import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory; import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory; import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory; +import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory; import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory; import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory; @@ -129,6 +133,13 @@ */ public class AnalyzerGuru { + /** + * A value used as a placeholder for a filename when content is anonymous + * (e.g. from temporary source or from a stream for which an identifier is + * not available). + */ + public static final String ANONYMOUS_NAME = ""; + /** * The maximum number of characters (multi-byte if a BOM is identified) to * read from the input stream to be used for magic string matching. 
@@ -244,6 +255,8 @@ public class AnalyzerGuru { private static final LangTreeMap langMap = new LangTreeMap(); private static final LangTreeMap defaultLangMap = new LangTreeMap(); + private static String hugeTextFileTypeName; + /* * If you write your own analyzer please register it here. The order is * important for any factory that uses a FileAnalyzerFactory.Matcher @@ -303,7 +316,8 @@ public class AnalyzerGuru { new AsmAnalyzerFactory(), new HCLAnalyzerFactory(), new TerraformAnalyzerFactory(), - new RAnalyzerFactory() + new RAnalyzerFactory(), + HugeTextAnalyzerFactory.DEFAULT_INSTANCE }; for (AnalyzerFactory analyzer : analyzers) { @@ -389,10 +403,25 @@ public static Map getfileTypeDescriptions() { return Collections.unmodifiableMap(fileTypeDescriptions); } - public List getAnalyzerFactories() { + public static List getAnalyzerFactories() { return Collections.unmodifiableList(factories); } + /** + * Gets the normalized name of the + * {@link org.opengrok.indexer.analysis.data.HugeTextAnalyzer} class. + * @return a defined instance + */ + public static String getHugeTextFileTypeName() { + if (hugeTextFileTypeName == null) { + String newValue = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(). + getFileTypeName(); + hugeTextFileTypeName = newValue; + return newValue; + } + return hugeTextFileTypeName; + } + /** * Register a {@code FileAnalyzerFactory} instance. */ @@ -532,29 +561,92 @@ public static AbstractAnalyzer getAnalyzer(String fileTypeName) { } /** - * Get an analyzer suited to analyze a file. This function will reuse - * analyzers since they are costly. + * Gets an analyzer factory suited to analyze a file, but without a check + * for Huge Text since the file size is not available. 
* * @param in Input stream containing data to be analyzed - * @param file Name of the file to be analyzed - * @return An analyzer suited for that file content + * @param fileName Name of the file to be analyzed + * @return An analyzer factory suited for that file content * @throws java.io.IOException If an error occurs while accessing the data * in the input stream. */ - public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException { - AnalyzerFactory factory = find(in, file); + public static AnalyzerFactory getAnalyzerFactory(InputStream in, String fileName) + throws IOException { + AnalyzerFactory factory = find(in, fileName); if (factory == null) { - AbstractAnalyzer defaultAnalyzer = getAnalyzer(); + factory = DEFAULT_ANALYZER_FACTORY; if (LOGGER.isLoggable(Level.FINEST)) { + AbstractAnalyzer defaultAnalyzer = factory.getAnalyzer(); LOGGER.log(Level.FINEST, "{0}: fallback {1}", - new Object[]{file, - defaultAnalyzer.getClass().getSimpleName() }); + new Object[]{fileName, defaultAnalyzer.getClass().getSimpleName()}); } - return defaultAnalyzer; } + return factory; + } + + /** + * Gets an analyzer suited to analyze a file, but without a check for Huge + * Text since the file size is not available. + * + * @param in Input stream containing data to be analyzed + * @param fileName Name of the file to be analyzed + * @return An analyzer suited for the file content + * @throws java.io.IOException If an error occurs while accessing the data + * in the input stream. + */ + public static AbstractAnalyzer getAnalyzer(InputStream in, String fileName) + throws IOException { + AnalyzerFactory factory = getAnalyzerFactory(in, fileName); return factory.getAnalyzer(); } + /** + * Gets an analyzer factory suited to analyze a file, with a check for Huge + * Text. 
+ * + * @param file a defined instance to be analyzed + * @param path Name (possibly normalized) of the file to be analyzed + * @param logHugeText a value indicating whether to log if the file is + * identified as Huge Text + * @return An analyzer factory suited for the file content + * @throws java.io.IOException If an error occurs while reading the file + */ + public static AnalyzerFactory getAnalyzerFactory(File file, String path, boolean logHugeText) + throws IOException { + + AnalyzerFactory fac; + try (InputStream in = new BufferedInputStream( + new FileInputStream(file))) { + fac = AnalyzerGuru.getAnalyzerFactory(in, path); + } + + if (AbstractAnalyzer.Genre.PLAIN.equals(fac.getGenre()) && + file.length() >= RuntimeEnvironment.getInstance().getHugeTextThresholdBytes()) { + String origFileTypeName = fac.getAnalyzer().getFileTypeName(); + fac = HugeTextAnalyzerFactory.DEFAULT_INSTANCE; + if (logHugeText && LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is huge text: {1}", + new Object[]{origFileTypeName, path}); + } + } + return fac; + } + + /** + * Get an analyzer suited to analyze a file, with a check for Huge Text. + * + * @param file a defined instance to be analyzed + * @param path Name (possibly normalized) of the file to be analyzed + * @param logHugeText a value indicating whether to log if the file is + * identified as Huge Text + * @return An analyzer suited for the file content + * @throws java.io.IOException If an error occurs while reading the file + */ + public static AbstractAnalyzer getAnalyzer(File file, String path, boolean logHugeText) + throws IOException { + return getAnalyzerFactory(file, path, logHugeText).getAnalyzer(); + } + /** * Free resources associated with all registered analyzers. 
*/ @@ -575,9 +667,8 @@ public static void returnAnalyzers() { * @throws IOException If an exception occurs while collecting the data * @throws InterruptedException if a timeout occurs */ - public void populateDocument(Document doc, File file, String path, - AbstractAnalyzer fa, Writer xrefOut) throws IOException, - InterruptedException { + public static void populateDocument(Document doc, File file, String path, + AbstractAnalyzer fa, Writer xrefOut) throws IOException, InterruptedException { String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND); @@ -626,7 +717,7 @@ public void populateDocument(Document doc, File file, String path, if (fa != null) { AbstractAnalyzer.Genre g = fa.getGenre(); - if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) { + if (g != null) { doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms)); } fa.analyze(doc, StreamSource.fromFile(file), xrefOut); @@ -700,24 +791,36 @@ public static void writeDumpedXref(String contextPath, } /** - * Get the genre of a file. + * Get the genre of a file, with a check for Huge Text. * * @param file The file to inspect + * @param fileName name of the file to inspect * @return The genre suitable to decide how to display the file */ - public static AbstractAnalyzer.Genre getGenre(String file) { - return getGenre(find(file)); + public static AbstractAnalyzer.Genre getGenre(File file, String fileName) { + try { + return getGenre(getAnalyzerFactory(file, fileName, true)); + } catch (IOException e) { + LOGGER.log(Level.WARNING, "Error reading {0}", fileName); + return null; + } } /** - * Get the genre of a bulk of data. + * Get the genre of a bulk of data, but without a check for Huge Text since + * the file size is not available. 
* * @param in A stream containing the data + * @param fileName name of the file to inspect * @return The genre suitable to decide how to display the file - * @throws java.io.IOException If an error occurs while getting the content */ - public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException { - return getGenre(find(in)); + public static AbstractAnalyzer.Genre getGenre(InputStream in, String fileName) { + try { + return getGenre(getAnalyzerFactory(in, fileName)); + } catch (IOException e) { + LOGGER.log(Level.WARNING, "Error reading {0}", fileName); + return null; + } } /** @@ -863,13 +966,12 @@ private static AnalyzerFactory findFactory(Class factoryClass) * * * @param in The input stream containing the data - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use * @throws java.io.IOException If a problem occurs while reading the data */ - public static AnalyzerFactory find(InputStream in, String file) - throws IOException { - AnalyzerFactory factory = find(file); + static AnalyzerFactory find(InputStream in, String fileName) throws IOException { + AnalyzerFactory factory = find(fileName); // TODO above is not that great, since if 2 analyzers share one extension // then only the first one registered will own it // it would be cool if above could return more analyzers and below would @@ -877,17 +979,23 @@ public static AnalyzerFactory find(InputStream in, String file) if (factory != null) { return factory; } - return findForStream(in, file); + return findForStream(in, fileName); } /** - * Finds a suitable analyser class for file name. + * Finds a suitable analyser class for {@code fileName}, which should only + * be used in rare situations, such as for a JAR member or when content is + * not available to support a full determination. + *

To clarify, a full determination as done by + * {@link #getAnalyzerFactory(File, String, boolean)} also reads a bit of + * content as well as inspects file length to determine the ultimate + * analyser. * - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use */ - public static AnalyzerFactory find(String file) { - String path = file; + public static AnalyzerFactory find(String fileName) { + String path = fileName; int i; // Get basename of the file first. @@ -906,8 +1014,7 @@ public static AnalyzerFactory find(String file) { if (factory != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}", - new Object[]{file, - factory.getClass().getSimpleName() }); + new Object[]{fileName, factory.getClass().getSimpleName()}); } return factory; } @@ -920,8 +1027,7 @@ public static AnalyzerFactory find(String file) { if (factory != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}", - new Object[]{file, - factory.getClass().getSimpleName() }); + new Object[]{fileName, factory.getClass().getSimpleName()}); } return factory; } @@ -939,8 +1045,8 @@ public static AnalyzerFactory find(String file) { * @throws java.io.IOException if an error occurs while reading data from * the stream */ - public static AnalyzerFactory find(InputStream in) throws IOException { - return findForStream(in, ""); + static AnalyzerFactory find(InputStream in) throws IOException { + return findForStream(in, ANONYMOUS_NAME); } /** @@ -948,13 +1054,13 @@ public static AnalyzerFactory find(InputStream in) throws IOException { * corresponding to a file of the specified name. 
* * @param in The stream containing the data to analyze - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use * @throws java.io.IOException if an error occurs while reading data from * the stream */ - private static AnalyzerFactory findForStream(InputStream in, - String file) throws IOException { + private static AnalyzerFactory findForStream(InputStream in, String fileName) + throws IOException { in.mark(MAGIC_BYTES_NUM); byte[] content = new byte[MAGIC_BYTES_NUM]; @@ -980,8 +1086,8 @@ private static AnalyzerFactory findForStream(InputStream in, if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by precise magic: {1}", new Object[]{ - file, fac.getClass().getSimpleName() }); + "{0}: chosen by precise magic: {1}", + new Object[]{fileName, fac.getClass().getSimpleName()}); } return fac; } @@ -990,7 +1096,7 @@ private static AnalyzerFactory findForStream(InputStream in, // Next, look for magic strings String opening = readOpening(in, content); - fac = findMagicString(opening, file); + fac = findMagicString(opening, fileName); if (fac != null) { return fac; } @@ -1002,9 +1108,8 @@ private static AnalyzerFactory findForStream(InputStream in, if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by imprecise magic: {1}", - new Object[]{file, - fac.getClass().getSimpleName() }); + "{0}: chosen by imprecise magic: {1}", + new Object[]{fileName, fac.getClass().getSimpleName()}); } return fac; } @@ -1014,7 +1119,7 @@ private static AnalyzerFactory findForStream(InputStream in, return null; } - private static AnalyzerFactory findMagicString(String opening, String file) { + private static AnalyzerFactory findMagicString(String opening, String fileName) { // first, try to look up two words in magics String fragment = getWords(opening, 2); @@ -1022,8 +1127,7 @@ private static AnalyzerFactory 
findMagicString(String opening, String file) { if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", - new Object[]{file, fac.getClass().getSimpleName(), - fragment}); + new Object[]{fileName, fac.getClass().getSimpleName(), fragment}); } return fac; } @@ -1034,8 +1138,7 @@ private static AnalyzerFactory findMagicString(String opening, String file) { if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", - new Object[]{file, fac.getClass().getSimpleName(), - fragment}); + new Object[]{fileName, fac.getClass().getSimpleName(), fragment}); } return fac; } @@ -1048,8 +1151,8 @@ private static AnalyzerFactory findMagicString(String opening, String file) { fac = entry.getValue(); if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by magic(substr) {2}: {1}", new Object[]{ - file, fac.getClass().getSimpleName(), magic}); + "{0}: chosen by magic(substr) {2}: {1}", + new Object[]{fileName, fac.getClass().getSimpleName(), magic}); } return fac; } @@ -1190,4 +1293,8 @@ private static boolean factoriesDifferent(AnalyzerFactory a, } return a_name == null || !a_name.equals(b_name); } + + /* private to enforce static */ + private AnalyzerGuru() { + } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java index 7b558656a7a..ccead09f4b6 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java @@ -59,6 +59,7 @@ public class FileAnalyzer extends AbstractAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class); + private static final String ANALYZER_LC = "analyzer"; /** * @return {@code null} as there is no aligned language @@ -134,10 +135,9 @@ protected 
FileAnalyzer(AnalyzerFactory factory, @Override public String getFileTypeName() { String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT); - String suffix = "analyzer"; - if (name.endsWith(suffix)) { - return name.substring(0, name.length() - suffix.length()); + if (name.endsWith(ANALYZER_LC)) { + return name.substring(0, name.length() - ANALYZER_LC.length()); } return name; diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java index 7017224f715..535b83979da 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2013, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2018, Chris Fraire . + * Portions Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.analysis; @@ -52,6 +52,11 @@ public abstract class StreamSource { */ public abstract InputStream getStream() throws IOException; + /** + * Gets a reportable identifier of the source. + */ + public abstract String getSourceIdentifier(); + /** * Helper method that creates a {@code StreamSource} instance that * reads data from a file. 
@@ -65,6 +70,11 @@ public static StreamSource fromFile(final File file) { public InputStream getStream() throws IOException { return new BufferedInputStream(new FileInputStream(file)); } + + @Override + public String getSourceIdentifier() { + return file.getAbsolutePath(); + } }; } @@ -82,6 +92,11 @@ public static StreamSource fromString(final String str) { public InputStream getStream() throws IOException { return new ByteArrayInputStream(sbuf); } + + @Override + public String getSourceIdentifier() { + return "String"; + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java index c07e392afda..9ce77b11e20 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java @@ -27,14 +27,16 @@ import java.io.IOException; import java.io.InputStream; import java.io.Writer; +import java.util.logging.Level; +import java.util.logging.Logger; + import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.tools.bzip2.CBZip2InputStream; import org.opengrok.indexer.analysis.AbstractAnalyzer; import org.opengrok.indexer.analysis.AnalyzerFactory; import org.opengrok.indexer.analysis.AnalyzerGuru; -import org.opengrok.indexer.analysis.FileAnalyzer; import org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.logger.LoggerFactory; import org.opengrok.indexer.search.QueryBuilder; /** @@ -43,17 +45,9 @@ * Created on September 22, 2005 * @author Chandan */ -public class BZip2Analyzer extends FileAnalyzer { - - private Genre g; +public class BZip2Analyzer extends CompressedAnalyzer { - @Override - public Genre getGenre() { - if (g != null) { - return g; - } - return super.getGenre(); - } + private static final Logger LOGGER = 
LoggerFactory.getLogger(BZip2Analyzer.class); protected BZip2Analyzer(AnalyzerFactory factory) { super(factory); @@ -71,11 +65,11 @@ public String getCtagsLang() { * Gets a version number to be used to tag processed documents so that * re-analysis can be re-done later if a stored version number is different * from the current implementation. - * @return 20180111_00 + * @return 20200417_00 */ @Override protected int getSpecializedVersionNo() { - return 20180111_00; // Edit comment above too! + return 20200417_00; // Edit comment above too! } @Override @@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut) try (InputStream in = bzSrc.getStream()) { fa = AnalyzerGuru.getAnalyzer(in, newname); } - if (!(fa instanceof BZip2Analyzer)) { - if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) { - this.g = Genre.XREFABLE; - } else { - this.g = Genre.DATA; - } - fa.analyze(doc, bzSrc, xrefOut); - if (doc.get(QueryBuilder.T) != null) { - doc.removeField(QueryBuilder.T); - if (g == Genre.XREFABLE) { - doc.add(new Field(QueryBuilder.T, g.typeName(), - AnalyzerGuru.string_ft_stored_nanalyzed_norms)); - } - } + if (fa == null) { + this.g = Genre.DATA; + LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname); + //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ... 
+ } else if (!(fa instanceof BZip2Analyzer)) { + analyzeUncompressed(doc, xrefOut, fa, bzSrc); } } } @@ -126,6 +112,11 @@ public InputStream getStream() throws IOException { throw new IOException("Not BZIP2 format"); } } + + @Override + public String getSourceIdentifier() { + return src.getSourceIdentifier(); + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java new file mode 100644 index 00000000000..4306e2c5174 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2017-2020, Chris Fraire . 
+ */ + +package org.opengrok.indexer.analysis.archive; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.AnalyzerFactory; +import org.opengrok.indexer.analysis.AnalyzerGuru; +import org.opengrok.indexer.analysis.FileAnalyzer; +import org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; +import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.logger.LoggerFactory; +import org.opengrok.indexer.search.QueryBuilder; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Represents a base for compressed formats (e.g. gzip or bzip2) but not for + * archive formats that have compression (e.g. Zip or Jar). + * @author Chandan + */ +public abstract class CompressedAnalyzer extends FileAnalyzer { + + private static final Logger LOGGER = LoggerFactory.getLogger(CompressedAnalyzer.class); + + private static final int CHUNK_SIZE = 8 * 1024; + + protected Genre g; + + @Override + public Genre getGenre() { + if (g != null) { + return g; + } + return super.getGenre(); + } + + protected CompressedAnalyzer(AnalyzerFactory factory) { + super(factory); + } + + protected void analyzeUncompressed( + Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc) + throws IOException, InterruptedException { + + if (fa.getGenre() == Genre.PLAIN) { + if (meetsHugeTextThreshold(compressedSrc)) { + String origFileTypeName = fa.getFileTypeName(); + fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); + g = Genre.DATA; + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is compressed huge text: {1}", + new Object[]{origFileTypeName, compressedSrc.getSourceIdentifier()}); + } + } else { + g = Genre.XREFABLE; + } + } else 
if (fa.getGenre() == Genre.XREFABLE) { + g = Genre.XREFABLE; + } else { + g = Genre.DATA; + } + + fa.analyze(doc, compressedSrc, xrefOut); + if (doc.get(QueryBuilder.T) != null) { + doc.removeField(QueryBuilder.T); + } + doc.add(new Field(QueryBuilder.T, g.typeName(), + AnalyzerGuru.string_ft_stored_nanalyzed_norms)); + } + + private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException { + RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + int hugeTextThresholdBytes = env.getHugeTextThresholdBytes(); + if (Integer.MAX_VALUE == hugeTextThresholdBytes) { + // Don't bother decompressing to count if the limit is MAX_VALUE. + return false; + } + + try (InputStream in = compressedSrc.getStream()) { + // Try skip first. + SkipResult result = meetsHugeTextThresholdBySkip(in, hugeTextThresholdBytes); + if (result.didMeet) { + return true; + } + + // Even if some skipped, only read==-1 is a true indicator of EOF. + long bytesRead = result.bytesSkipped; + byte[] buf = new byte[CHUNK_SIZE]; + long n; + while ((n = in.read(buf, 0, buf.length)) != -1) { + bytesRead += n; + if (bytesRead >= hugeTextThresholdBytes) { + return true; + } + } + } + return false; + } + + private SkipResult meetsHugeTextThresholdBySkip(InputStream in, int hugeTextThresholdBytes) { + long bytesSkipped = 0; + long n; + try { + while ((n = in.skip(CHUNK_SIZE)) > 0) { + bytesSkipped += n; + if (bytesSkipped >= hugeTextThresholdBytes) { + return new SkipResult(bytesSkipped, true); + } + } + } catch (IOException ignored) { + // Ignore and assume not capable of skip. 
+ } + return new SkipResult(bytesSkipped, false); + } + + private static class SkipResult { + final long bytesSkipped; + final boolean didMeet; + + SkipResult(long bytesSkipped, boolean didMeet) { + this.bytesSkipped = bytesSkipped; + this.didMeet = didMeet; + } + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java index 839e6594d59..e735dc49724 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java @@ -32,11 +32,9 @@ import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.opengrok.indexer.analysis.AbstractAnalyzer; import org.opengrok.indexer.analysis.AnalyzerFactory; import org.opengrok.indexer.analysis.AnalyzerGuru; -import org.opengrok.indexer.analysis.FileAnalyzer; import org.opengrok.indexer.analysis.StreamSource; import org.opengrok.indexer.logger.LoggerFactory; import org.opengrok.indexer.search.QueryBuilder; @@ -47,20 +45,10 @@ * Created on September 22, 2005 * @author Chandan */ -public class GZIPAnalyzer extends FileAnalyzer { +public class GZIPAnalyzer extends CompressedAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(GZIPAnalyzer.class); - private Genre g; - - @Override - public Genre getGenre() { - if (g != null) { - return g; - } - return super.getGenre(); - } - protected GZIPAnalyzer(AnalyzerFactory factory) { super(factory); } @@ -77,11 +65,11 @@ public String getCtagsLang() { * Gets a version number to be used to tag processed documents so that * re-analysis can be re-done later if a stored version number is different * from the current implementation. 
- * @return 20180111_00 + * @return 20200417_00 */ @Override protected int getSpecializedVersionNo() { - return 20180111_00; // Edit comment above too! + return 20200417_00; // Edit comment above too! } @Override @@ -93,30 +81,16 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut) String path = doc.get(QueryBuilder.PATH); if (path != null && path.toLowerCase(Locale.ROOT).endsWith(".gz")) { String newname = path.substring(0, path.length() - 3); - //System.err.println("GZIPPED OF = " + newname); try (InputStream gzis = gzSrc.getStream()) { fa = AnalyzerGuru.getAnalyzer(gzis, newname); } if (fa == null) { this.g = Genre.DATA; - LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname); + LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname); //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ... } else { // cant recurse! //simple file gziped case captured here - if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) { - this.g = Genre.XREFABLE; - } else { - this.g = Genre.DATA; - } - fa.analyze(doc, gzSrc, xrefOut); - if (doc.get(QueryBuilder.T) != null) { - doc.removeField(QueryBuilder.T); - if (g == Genre.XREFABLE) { - doc.add(new Field(QueryBuilder.T, g.typeName(), - AnalyzerGuru.string_ft_stored_nanalyzed_norms)); - } - } - + analyzeUncompressed(doc, xrefOut, fa, gzSrc); } } } @@ -131,6 +105,11 @@ public InputStream getStream() throws IOException { return new BufferedInputStream( new GZIPInputStream(src.getStream())); } + + @Override + public String getSourceIdentifier() { + return src.getSourceIdentifier(); + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java new file mode 100644 index 00000000000..118cd9a533f --- /dev/null +++ 
b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.analysis.data; + +import org.apache.lucene.document.Document; +import org.opengrok.indexer.analysis.AnalyzerFactory; +import org.opengrok.indexer.analysis.FileAnalyzer; +import org.opengrok.indexer.analysis.OGKTextField; +import org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.search.QueryBuilder; +import org.opengrok.indexer.util.LimitedReader; +import org.opengrok.indexer.util.IOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.Writer; +import java.nio.charset.StandardCharsets; + +/** + * Represents an analyzer for huge text data files that are not eligible for + * xref. + */ +public class HugeTextAnalyzer extends FileAnalyzer { + + /** + * Creates a new instance. 
+ * @param factory defined instance for the analyzer + */ + protected HugeTextAnalyzer(AnalyzerFactory factory) { + super(factory); + } + + /** + * @return {@code null} as there is no aligned language + */ + @Override + public String getCtagsLang() { + return null; + } + + /** + * Gets a version number to be used to tag processed documents so that + * re-analysis can be re-done later if a stored version number is different + * from the current implementation. + * @return 20200415_00 + */ + @Override + protected int getSpecializedVersionNo() { + return 20200415_00; // Edit comment above too! + } + + @Override + public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException { + /* + * Though we don't intend to xref, Lucene demands consistency or else it + * would throw IllegalArgumentException: cannot change field "full" from + * index options=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS to + * inconsistent index options=DOCS_AND_FREQS_AND_POSITIONS + */ + doc.add(new OGKTextField(QueryBuilder.FULL, getReader(src.getStream()))); + } + + protected Reader getReader(InputStream stream) throws IOException { + // sourceRoot is read with UTF-8 as a default. + return new LimitedReader(IOUtils.createBOMStrippedReader(stream, + StandardCharsets.UTF_8.name()), + RuntimeEnvironment.getInstance().getHugeTextLimitCharacters()); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java new file mode 100644 index 00000000000..f3f84651e1d --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.analysis.data; + +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.FileAnalyzerFactory; + +/** + * Represents a factory for creating {@link HugeTextAnalyzer} instances. + */ +public class HugeTextAnalyzerFactory extends FileAnalyzerFactory { + + private static final String NAME = "Huge Text"; + + /** + * Gets a factory instance with no associated file extensions nor magic nor + * any other mapping attribute. + */ + public static final HugeTextAnalyzerFactory DEFAULT_INSTANCE = new HugeTextAnalyzerFactory(); + + private HugeTextAnalyzerFactory() { + super(null, null, null, null, null, null, AbstractAnalyzer.Genre.DATA, NAME); + } + + /** + * Creates a new {@link HugeTextAnalyzer} instance. 
+ * @return a defined instance + */ + @Override + protected AbstractAnalyzer newAnalyzer() { + return new HugeTextAnalyzer(this); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java index 666929ffa9f..2df3e93abaa 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java @@ -76,6 +76,8 @@ public final class Configuration { private static final Logger LOGGER = LoggerFactory.getLogger(Configuration.class); public static final String PLUGIN_DIRECTORY_DEFAULT = "plugins"; + public static final int HUGE_TEXT_THRESHOLD_BYTES_DEFAULT = 1_000_000; + public static final int HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT = 5_000_000; /** * A check if a pattern contains at least one pair of parentheses meaning @@ -301,6 +303,9 @@ public final class Configuration { private Set disabledRepositories; + private int hugeTextThresholdBytes; + private int hugeTextLimitCharacters; + /* * types of handling history for remote SCM repositories: * ON - index history and display it in webapp @@ -526,6 +531,8 @@ public Configuration() { setHistoryCacheTime(30); setHistoryEnabled(true); setHitsPerPage(25); + setHugeTextLimitCharacters(HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT); + setHugeTextThresholdBytes(HUGE_TEXT_THRESHOLD_BYTES_DEFAULT); setIgnoredNames(new IgnoredNames()); setIncludedNames(new Filter()); setIndexVersionedFilesOnly(false); @@ -1323,6 +1330,37 @@ public void setDisabledRepositories(Set disabledRepositories) { this.disabledRepositories = disabledRepositories; } + /** + * Gets the number of bytes at which a plain-text file will be analyzed + * as a huge text data file and be ineligible for xref. Default is 1_000_000. 
+ */ + public int getHugeTextThresholdBytes() { + return hugeTextThresholdBytes; + } + + /** + * Sets the number of bytes at which a plain-text file will be analyzed + * as a huge text data file and be ineligible for xref. + */ + public void setHugeTextThresholdBytes(int value) { + hugeTextThresholdBytes = Math.max(value, 0); + } + + /** + * Gets the number of characters to analyze from a huge text data file. + * Default is 5_000_000. + */ + public int getHugeTextLimitCharacters() { + return hugeTextLimitCharacters; + } + + /** + * Sets the number of characters to analyze from a huge text data file. + */ + public void setHugeTextLimitCharacters(int value) { + hugeTextLimitCharacters = Math.max(value, 0); + } + /** * Write the current configuration to a file. * diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java index 81ea875c018..8f6be420edf 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java @@ -92,10 +92,15 @@ public static String getSamples() throws RuntimeException { mthd); } + String propertyName = mthd.getName().replaceFirst("^set", ""); + sample = conf.getXMLRepresentationAsString(); sample = sample.replaceFirst( "(?sx)^<\\?xml.*Configuration\\d*\">\\n", ""); sample = sample.replaceFirst("\\n", ""); + // With Java 11 the following excision is necessary. + sample = sample.replaceFirst("(?isx)^.*\\n(?=\\s* disabledRepositories) { syncWriteConfiguration(disabledRepositories, Configuration::setDisabledRepositories); } + /** + * Gets the configured number of bytes at which a plain-text file will be + * analyzed as a huge text data file and be ineligible for xref. 
+ */ + public int getHugeTextThresholdBytes() { + return syncReadConfiguration(Configuration::getHugeTextThresholdBytes); + } + + /** + * Sets the configured number of bytes at which a plain-text file will be + * analyzed as a huge text data file and be ineligible for xref. + */ + public void setHugeTextThresholdBytes(int hugeTextThresholdBytes) { + syncWriteConfiguration(hugeTextThresholdBytes, Configuration::setHugeTextThresholdBytes); + } + + /** + * Gets the configured number of characters to analyze from a huge text + * data file. + */ + public int getHugeTextLimitCharacters() { + return syncReadConfiguration(Configuration::getHugeTextLimitCharacters); + } + + /** + * Sets the configured number of characters to analyze from a huge text + * data file. + */ + public void setHugeTextLimitCharacters(int hugeTextLimitCharacters) { + syncWriteConfiguration(hugeTextLimitCharacters, Configuration::setHugeTextLimitCharacters); + } + /** * Read an configuration file and set it as the current configuration. * diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java index d7cd2fcfbf6..b4df2ec0772 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2018, Chris Fraire . + * Portions Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.index; @@ -50,10 +50,6 @@ public void fileAdd(String path, String analyzer) { public void fileRemove(String path) { LOGGER.log(Level.FINE, "Remove file:{0}", path); } - @Override - public void fileUpdate(String path) { - LOGGER.log(Level.FINE, "Update: {0}", path); - } @Override public void fileAdded(String path, String analyzer) { diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java index d7fa0921053..c0700775104 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java @@ -19,6 +19,7 @@ /* * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2020, Chris Fraire . */ package org.opengrok.indexer.index; @@ -50,9 +51,4 @@ public interface IndexChangedListener { * @param path The path to the file (absolute from source root) */ void fileRemoved(String path); - /** - * A file is to be updated in the index database. 
- * @param path The path to the file (absolute from source root) - */ - void fileUpdate(String path); } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java index 948f527fa2d..7e395255908 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java @@ -23,14 +23,11 @@ */ package org.opengrok.indexer.index; -import java.io.BufferedInputStream; import java.io.BufferedWriter; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Files; @@ -118,6 +115,7 @@ public class IndexDatabase { (File p1, File p2) -> p1.getName().compareTo(p2.getName()); private static final Set CHECK_FIELDS; + private static final RuntimeEnvironment env = RuntimeEnvironment.getInstance(); private final Object INSTANCE_LOCK = new Object(); @@ -129,7 +127,7 @@ public class IndexDatabase { private final Map indexedSymlinks = new TreeMap<>( Comparator.comparingInt(String::length).thenComparing(o -> o)); - private Project project; + private final Project project; private FSDirectory indexDirectory; private IndexReader reader; private IndexWriter writer; @@ -138,7 +136,6 @@ public class IndexDatabase { private TermsEnum uidIter; private PostingsEnum postsIter; private PathAccepter pathAccepter; - private AnalyzerGuru analyzerGuru; private File xrefDir; private boolean interrupted; private CopyOnWriteArrayList listeners; @@ -191,7 +188,7 @@ public IndexDatabase(Project project) throws IOException { */ static CountDownLatch updateAll(IndexChangedListener listener) throws IOException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + List dbs = new ArrayList<>(); 
if (env.hasProjects()) { @@ -202,8 +199,7 @@ static CountDownLatch updateAll(IndexChangedListener listener) dbs.add(new IndexDatabase()); } - IndexerParallelizer parallelizer = RuntimeEnvironment.getInstance(). - getIndexerParallelizer(); + IndexerParallelizer parallelizer = env.getIndexerParallelizer(); CountDownLatch latch = new CountDownLatch(dbs.size()); for (IndexDatabase d : dbs) { final IndexDatabase db = d; @@ -236,7 +232,6 @@ public void run() { * @param paths list of paths to be indexed */ public static void update(IndexChangedListener listener, List paths) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexerParallelizer parallelizer = env.getIndexerParallelizer(); List dbs = new ArrayList<>(); @@ -291,7 +286,6 @@ public void run() { @SuppressWarnings("PMD.CollapsibleIfStatements") private void initialize() throws IOException { synchronized (INSTANCE_LOCK) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File indexDir = new File(env.getDataRootFile(), INDEX_DIR); if (project != null) { indexDir = new File(indexDir, project.getPath()); @@ -307,7 +301,6 @@ private void initialize() throws IOException { lockfact = pickLockFactory(env); indexDirectory = FSDirectory.open(indexDir.toPath(), lockfact); pathAccepter = env.getPathAccepter(); - analyzerGuru = new AnalyzerGuru(); xrefDir = new File(env.getDataRootFile(), XREF_DIR); listeners = new CopyOnWriteArrayList<>(); dirtyFile = new File(indexDir, "dirty"); @@ -332,7 +325,7 @@ public boolean addDirectory(String dir) { } else if (directory.charAt(0) != '/') { directory = "/" + directory; } - File file = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), directory); + File file = new File(env.getSourceRootFile(), directory); if (file.exists()) { directories.add(directory); return true; @@ -341,15 +334,13 @@ public boolean addDirectory(String dir) { } private void showFileCount(String dir, IndexDownArgs args) { - if (RuntimeEnvironment.getInstance().isPrintProgress()) { 
+ if (env.isPrintProgress()) { LOGGER.log(Level.INFO, String.format("Need to process: %d files for %s", args.cur_count, dir)); } } private void markProjectIndexed(Project project) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - // Successfully indexed the project. The message is sent even if // the project's isIndexed() is true because it triggers RepositoryInfo // refresh. @@ -391,8 +382,6 @@ public void update() throws IOException { interrupted = false; } - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - reader = null; writer = null; settings = null; @@ -535,7 +524,6 @@ public void update() throws IOException { */ static CountDownLatch optimizeAll() throws IOException { List dbs = new ArrayList<>(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexerParallelizer parallelizer = env.getIndexerParallelizer(); if (env.hasProjects()) { for (Project project : env.getProjectList()) { @@ -658,7 +646,6 @@ private File whatXrefFile(String path, boolean compress) { * @param path path to file under source root */ private void removeXrefFile(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File xrefFile = whatXrefFile(path, env.isCompressXref()); PendingFileDeletion pending = new PendingFileDeletion( xrefFile.getAbsolutePath()); @@ -708,8 +695,7 @@ private void removeFile(boolean removeHistory) throws IOException { private void addFile(File file, String path, Ctags ctags) throws IOException, InterruptedException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - AbstractAnalyzer fa = getAnalyzerFor(file, path); + AbstractAnalyzer fa = AnalyzerGuru.getAnalyzer(file, path, true); for (IndexChangedListener listener : listeners) { listener.fileAdd(path, fa.getClass().getSimpleName()); @@ -726,7 +712,7 @@ private void addFile(File file, String path, Ctags ctags) Document doc = new Document(); try (Writer xrefOut = newXrefWriter(fa, path)) { - analyzerGuru.populateDocument(doc, file, path, fa, 
xrefOut); + AnalyzerGuru.populateDocument(doc, file, path, fa, xrefOut); } catch (InterruptedException e) { LOGGER.log(Level.WARNING, "File ''{0}'' interrupted--{1}", new Object[]{path, e.getMessage()}); @@ -760,14 +746,6 @@ private void addFile(File file, String path, Ctags ctags) } } - private AbstractAnalyzer getAnalyzerFor(File file, String path) - throws IOException { - try (InputStream in = new BufferedInputStream( - new FileInputStream(file))) { - return AnalyzerGuru.getAnalyzer(in, path); - } - } - /** * Do a best effort to clean up all resources allocated when populating * a Lucene document. On normal execution, these resources should be @@ -849,7 +827,6 @@ private boolean accept(File file, AcceptSymlinkRet ret) { } // this is an unversioned file, check if it should be indexed - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); boolean res = !env.isIndexVersionedFilesOnly(); if (!res) { LOGGER.log(Level.FINER, "not accepting unversioned {0}", @@ -918,7 +895,6 @@ private boolean acceptSymlink(Path absolute, File canonical, AcceptSymlinkRet re String absolute1 = absolute.toString(); String canonical1 = canonical.getPath(); boolean isCanonicalDir = canonical.isDirectory(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexedSymlink indexed1; String absolute0; @@ -1072,7 +1048,6 @@ private boolean acceptSymlink(Path absolute, File canonical, AcceptSymlinkRet re * @return true if the file is local to the current repository */ private boolean isLocal(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); String srcRoot = env.getSourceRootPath(); if (path.startsWith(srcRoot + File.separator)) { @@ -1219,8 +1194,7 @@ private void indexParallel(String dir, IndexDownArgs args) { AtomicInteger successCounter = new AtomicInteger(); AtomicInteger currentCounter = new AtomicInteger(); AtomicInteger alreadyClosedCounter = new AtomicInteger(); - IndexerParallelizer parallelizer = RuntimeEnvironment.getInstance(). 
- getIndexerParallelizer(); + IndexerParallelizer parallelizer = env.getIndexerParallelizer(); ObjectPool ctagsPool = parallelizer.getCtagsPool(); Map> bySuccess = null; @@ -1342,7 +1316,6 @@ public void addIndexChangedListener(IndexChangedListener listener) { */ public static Set getAllFiles(List subFiles) throws IOException { Set files = new HashSet<>(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); if (env.hasProjects()) { if (subFiles == null || subFiles.isEmpty()) { @@ -1442,7 +1415,6 @@ public int getNumFiles() throws IOException { static void listFrequentTokens(List subFiles) throws IOException { final int limit = 4; - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); if (env.hasProjects()) { if (subFiles == null || subFiles.isEmpty()) { for (Project project : env.getProjectList()) { @@ -1508,8 +1480,6 @@ public void listTokens(int freq) throws IOException { */ public static IndexReader getIndexReader(String path) { IndexReader ret = null; - - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File indexDir = new File(env.getDataRootFile(), INDEX_DIR); if (env.hasProjects()) { @@ -1565,7 +1535,7 @@ public static Definitions getDefinitions(File file) throws ParseException, IOExc */ public static Document getDocument(File file) throws IOException, ParseException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + String path; try { path = env.getPathRelativeToSourceRoot(file); @@ -1639,7 +1609,7 @@ private boolean isXrefWriter(AbstractAnalyzer fa) { */ private Writer newXrefWriter(AbstractAnalyzer fa, String path) throws IOException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + if (env.isGenerateHtml() && isXrefWriter(fa)) { boolean compressed = env.isCompressXref(); File xrefFile = whatXrefFile(path, compressed); @@ -1710,16 +1680,15 @@ private void finishWriting() throws IOException { } /** - * Verify TABSIZE, and evaluate AnalyzerGuru version together with ZVER -- - * or return a value to 
indicate mismatch. + * Verify TABSIZE, validate AnalyzerGuru version together with Analyzer + * version, and recheck huge text file constraint -- or return a value to + * indicate mismatch. * @param file the source file object * @param path the source file path * @return {@code false} if a mismatch is detected */ - private boolean checkSettings(File file, - String path) throws IOException { + private boolean checkSettings(File file, String path) throws IOException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); boolean outIsXrefWriter = false; int reqTabSize = project != null && project.hasTabSizeSetting() ? project.getTabSize() : 0; @@ -1761,8 +1730,7 @@ private boolean checkSettings(File file, break; } - AnalyzerFactory fac = - AnalyzerGuru.findByFileTypeName(fileTypeName); + AnalyzerFactory fac = AnalyzerGuru.findByFileTypeName(fileTypeName); if (fac != null) { fa = fac.getAnalyzer(); } @@ -1774,7 +1742,7 @@ private boolean checkSettings(File file, */ LOGGER.log(Level.FINER, "Guru version mismatch: {0}", path); - fa = getAnalyzerFor(file, path); + fa = AnalyzerGuru.getAnalyzer(file, path, false); fileTypeName = fa.getFileTypeName(); String oldTypeName = doc.get(QueryBuilder.TYPE); if (!fileTypeName.equals(oldTypeName)) { @@ -1797,7 +1765,27 @@ private boolean checkSettings(File file, return false; } + // If it is a Huge Text file, re-check constraints. + if (AnalyzerGuru.getHugeTextFileTypeName().equals(fileTypeName) && + file.length() < env.getHugeTextThresholdBytes()) { + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} no longer qualifies: {1}", + new Object[]{fileTypeName, path}); + } + return false; + } + if (fa != null) { + // If the Genre is PLAIN, re-check Huge Text file constraints. 
+ if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && + file.length() >= env.getHugeTextThresholdBytes()) { + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is now huge text: {1}", + new Object[]{fileTypeName, path}); + } + return false; + } + outIsXrefWriter = isXrefWriter(fa); } @@ -1840,7 +1828,6 @@ private IndexAnalysisSettings3 readAnalysisSettings() throws IOException { } private boolean xrefExistsFor(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File xrefFile = whatXrefFile(path, env.isCompressXref()); if (!xrefFile.exists()) { LOGGER.log(Level.FINEST, "Missing {0}", xrefFile); diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java index 9e8e2ac41bd..e3cc44f4c10 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java @@ -432,7 +432,18 @@ public static String[] parseOptions(String[] argv) throws ParseException { searchPaths.clear(); - // Limit usage lines to 72 characters for concise formatting. + /* + * FOR CONCISE FORMATTING, LIMIT USAGE DESCRIPTION LINES TO + * + * 8888888888 .d8888b. 888 + * d88P d88P Y88b 888 + * d88P 888 888 + * d88P .d88P .d8888b 88888b. 8888b. 888d888 .d8888b + * 88888888 .od888P" d88P" 888 "88b "88b 888P" 88K + * d88P d88P" 888 888 888 .d888888 888 "Y8888b. + * d88P 888" Y88b. 888 888 888 888 888 X88 + * d88P 888888888 "Y8888P 888 888 "Y888888 888 88888P' + */ optParser = OptionParser.execute(parser -> { parser.setPrologue( @@ -550,16 +561,27 @@ public static String[] parseOptions(String[] argv) throws ParseException { parser.on("-H", "--history", "Enable history.").execute(v -> cfg.setHistoryEnabled(true)); + parser.on("--historyRenamedThreads", "=number", Integer.class, + "The number of threads to use for history cache generation when dealing", + "with renamed files. 
By default the number of threads will be set to the", + "number of available CPUs. Assumes --renamedHistory=on").execute(threadCount -> + cfg.setHistoryRenamedParallelism((Integer) threadCount)); + parser.on("--historyThreads", "=number", Integer.class, - "The number of threads to use for history cache generation. By default the number", - "of threads will be set to the number of available CPUs. Assumes -H/--history.").execute(threadCount -> + "The number of threads to use for history cache generation. By default", + "the number of threads will be set to the number of available CPUs.", + "Assumes -H/--history.").execute(threadCount -> cfg.setHistoryParallelism((Integer) threadCount)); - parser.on("--historyRenamedThreads", "=number", Integer.class, - "The number of threads to use for history cache generation when dealing with renamed files.", - "By default the number of threads will be set to the number of available CPUs.", - "Assumes --renamedHistory=on").execute(threadCount -> - cfg.setHistoryRenamedParallelism((Integer) threadCount)); + parser.on("--hugeBytes", "=number", Integer.class, + "Threshold number of bytes to qualify a Huge Text data file vs a plain-", + "text source code file. Default is 1_000_000.").execute(value -> + cfg.setHugeTextThresholdBytes((int) value)); + + parser.on("--hugeCharacters", "=number", Integer.class, + "Limit for number of characters to read and index from a Huge Text data", + "file. Default is 5_000_000.").execute(value -> + cfg.setHugeTextLimitCharacters((int) value)); parser.on("-I", "--include", "=pattern", "Only files matching this pattern will be examined. 
Pattern supports", diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java new file mode 100644 index 00000000000..5c106e03439 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.util; + +import java.io.IOException; +import java.io.Reader; + +/** + * Represents a {@link Reader} wrapper that limits characters read as specified + * and only up to {@link Integer#MAX_VALUE} to accommodate Lucene offset limits. + */ +public class LimitedReader extends Reader { + + private final int characterLimit; + private final Reader underlying; + private int characterCount; + private boolean didEOF; + + /** + * Initializes a new instance to wrap the specified {@code underlying}. 
+ * @param underlying a defined instance + * @param characterLimit a non-negative number or alternatively a negative + * number to indicate {@link Integer#MAX_VALUE} + */ + public LimitedReader(Reader underlying, int characterLimit) { + if (underlying == null) { + throw new IllegalArgumentException("underlying is null"); + } + this.underlying = underlying; + this.characterLimit = characterLimit < 0 ? Integer.MAX_VALUE : characterLimit; + } + + /** + * Calls {@link Reader#read(char[], int, int)} on the underlying {@link Reader} but only + * up to {@code characterLimit}, after which EOF will be indicated. + * @return The number of characters read, or -1 if the end of the stream or + * the {@code characterLimit} has been reached + */ + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (didEOF) { + return -1; + } + + int adjustedLen = Math.min(len, characterLimit - characterCount); + int ret = underlying.read(cbuf, off, adjustedLen); + if (ret < 0) { + didEOF = true; + return -1; + } + characterCount += ret; + if (characterCount >= characterLimit) { + didEOF = true; + } + return ret; + } + + /** + * Calls {@link Reader#close()} on the underlying {@link Reader}. + */ + @Override + public void close() throws IOException { + underlying.close(); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java index c9be44dd108..ca38ff60ad9 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java @@ -641,12 +641,11 @@ public void destroy() { /** * Searches for a document for a single file from the index. 
* @param file the file whose definitions to find - * @return {@link ScoreDoc#doc} or -1 if it could not be found + * @return a defined instance or {@code null} if not found * @throws IOException if an error happens when accessing the index * @throws ParseException if an error happens when building the Lucene query */ - public int searchSingle(File file) throws IOException, - ParseException { + public SingleResult searchSingleResult(File file) throws IOException, ParseException { RuntimeEnvironment env = RuntimeEnvironment.getInstance(); String path; @@ -654,7 +653,7 @@ public int searchSingle(File file) throws IOException, path = env.getPathRelativeToSourceRoot(file); } catch (ForbiddenSymlinkException e) { LOGGER.log(Level.FINER, e.getMessage()); - return -1; + return null; } //sanitize windows path delimiters //in order not to conflict with Lucene escape character @@ -668,7 +667,7 @@ public int searchSingle(File file) throws IOException, TopDocs top = searcher.search(query, 1); if (top.totalHits.value == 0) { - return -1; + return null; } int docID = top.scoreDocs[0].doc; @@ -677,10 +676,41 @@ public int searchSingle(File file) throws IOException, String foundPath = doc.get(QueryBuilder.PATH); // Only use the result if PATH matches exactly. if (!path.equals(foundPath)) { - return -1; + return null; } - return docID; + return new SingleResult(doc, docID); + } + + /** + * Searches for a document for a single file from the index. 
+ * @param file the file whose definitions to find + * @return {@link ScoreDoc#doc} or -1 if it could not be found + * @throws IOException if an error happens when accessing the index + * @throws ParseException if an error happens when building the Lucene query + */ + public int searchSingle(File file) throws IOException, ParseException { + SingleResult result = searchSingleResult(file); + if (result != null) { + return result.getDocID(); + } + return -1; + } + + /** + * Searches for a document for a single file from the index to retrieve its + * {@link AbstractAnalyzer.Genre}. + * @param file the file whose indexed genre to find + * @return a defined instance or {@code null} if not found + * @throws IOException if an error happens when accessing the index + * @throws ParseException if an error happens when building the Lucene query + */ + public AbstractAnalyzer.Genre searchSingleGenre(File file) throws IOException, ParseException { + SingleResult result = searchSingleResult(file); + if (result != null) { + return AbstractAnalyzer.Genre.get(result.getDocument().get(QueryBuilder.T)); + } + return null; + } /** diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java new file mode 100644 index 00000000000..2e1055693ae --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.web; + +import org.apache.lucene.document.Document; + +/** + * Represents a single-document search result. + */ +public class SingleResult { + private final Document document; + private final int docID; + + public SingleResult(Document document, int docID) { + this.document = document; + this.docID = docID; + } + + public Document getDocument() { + return document; + } + + public int getDocID() { + return docID; + } +} diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java index d640c865f0c..59658c168ec 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis; @@ -505,6 +505,11 @@ public void testJavaClassAnalyzer() throws Exception { ".class"; return StringWriter.class.getResourceAsStream(path); } + + @Override + public String getSourceIdentifier() { + return "StringWriter.class"; + } }; Document doc = new Document(); StringWriter out = new StringWriter(); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java index 074fbfaf74c..aab2ae95ba4 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java @@ -72,7 +72,6 @@ public static Test suite() { } } Analyzer testA; - AnalyzerGuru guru; Method testM; Object testC = null; @@ -81,7 +80,6 @@ public static Test suite() { */ @Override protected void setUp() throws Exception { - guru = new AnalyzerGuru(); Class c = Class.forName(LUCENE_TEST_CLASS); //testC = c.newInstance(); //this is static call Class[] argTypes = {TokenStream.class, String[].class, int[].class, int[].class, String[].class, int[].class, int[].class, Integer.class, boolean.class}; @@ -89,7 +87,7 @@ protected void setUp() throws Exception { } public void testCompatibility() throws Exception { - for (AnalyzerFactory fa : guru.getAnalyzerFactories()) { + for (AnalyzerFactory fa : AnalyzerGuru.getAnalyzerFactories()) { String input = "Hello world"; String[] output = new String[]{"Hello", "world"}; testA = fa.getAnalyzer(); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java index 6c3409160d4..f0db02d7d31 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java +++ 
b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2010, 2018, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017, 2020, Chris Fraire . */ package org.opengrok.indexer.analysis; @@ -49,6 +49,11 @@ private static StreamSource getStreamSource(final byte[] bytes) { public InputStream getStream() throws IOException { return new ByteArrayInputStream(bytes); } + + @Override + public String getSourceIdentifier() { + return "byte[]"; + } }; } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java index 12d43697e17..83c75641830 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.archive; @@ -47,7 +47,7 @@ public void testZipWrtAnalyzerGuru() throws IOException { assertNotNull("zip.bin should be available,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("zip.bin should have factory", fac); assertSame("should be ZipAnalyzerFactory", fac.getClass(), ZipAnalyzerFactory.class); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java index 32d1ea44067..97fbddf8211 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.c; @@ -60,15 +60,6 @@ public class CAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -110,7 +101,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java index 365209353bb..3bf1125eb69 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.c; @@ -60,15 +60,6 @@ public class CxxAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -111,7 +102,7 @@ public void testScopeAnalyzer() throws Exception { analyzer.setScopesEnabled(true); System.out.println(path); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java index 8852f0dca37..ee1cee1dfb5 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2016, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.clojure; @@ -57,15 +57,6 @@ public class ClojureAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -106,7 +97,7 @@ public void testScopeAnalyzer() throws Exception { string_ft_nstored_nanalyzed_norms)); StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); Definitions definitions = Definitions.deserialize(doc.getField(QueryBuilder.TAGS).binaryValue().bytes); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java index 8b2a6b0ecd5..cda1cc09547 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.csharp; @@ -56,15 +56,6 @@ public class CSharpAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -105,7 +96,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java index 48356f2a265..ec67d9798ce 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java @@ -20,6 +20,7 @@ /* * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. * Portions copyright 2009 - 2011 Jens Elkner. + * Portions Copyright (c) 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.document; @@ -124,6 +125,11 @@ public void testAnalyze() throws IOException { public InputStream getStream() throws IOException { return new ByteArrayInputStream(content.getBytes()); } + + @Override + public String getSourceIdentifier() { + return "String"; + } }, xrefOut); } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java index 661cc9bd569..7a03f3f0cd0 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.analysis.executables; @@ -47,7 +47,7 @@ public void testJarWrtAnalyzerGuru() throws IOException { assertNotNull("javajar.bin should be available,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("javajar.bin should have factory", fac); assertSame("should be JarAnalyzerFactory", fac.getClass(), JarAnalyzerFactory.class); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java index a6123eec18d..c00a02c45a6 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2017, Chris Fraire . + * Copyright (c) 2017, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.executables; @@ -48,7 +48,7 @@ public void testJavaClassWrtAnalyzerGuru() throws IOException { assertNotNull("despite inclusion locally,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("javaclass.bin should have factory", fac); assertSame("should be JavaClassAnalyzerFactory", fac.getClass(), JavaClassAnalyzerFactory.class); @@ -64,7 +64,7 @@ public void testDylibCafebabeWrtAnalyzerGuru() throws IOException { "analysis/executables/fat.dylib"); assertNotNull("despite inclusion locally,", res); - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); if (fac != null) { assertNotSame("should not be JavaClassAnalyzerFactory", fac.getClass(), JavaClassAnalyzerFactory.class); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java index be100a8739e..f4599ff22f5 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.java; @@ -60,15 +60,6 @@ public class JavaAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -108,7 +99,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java index 6021442d69c..f17d80fcec3 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2016, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.pascal; @@ -58,15 +58,6 @@ public class PascalAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -108,7 +99,7 @@ public void testAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); Definitions definitions = Definitions.deserialize(doc.getField(QueryBuilder.TAGS).binaryValue().bytes); assertNotNull(definitions); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java index fcc2d20df9b..e384cdb3865 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.plain; @@ -174,6 +174,11 @@ public InputStream getStream() throws IOException { assertNotNull(name + " as resource,", srcres); return srcres; } + + @Override + public String getSourceIdentifier() { + return name; + } }; } } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/configuration/ConfigurationHelpTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/configuration/ConfigurationHelpTest.java index 6312821c2d9..b3d4c63cbc8 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/configuration/ConfigurationHelpTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/configuration/ConfigurationHelpTest.java @@ -18,11 +18,12 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.configuration; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.junit.Test; @@ -33,9 +34,12 @@ public class ConfigurationHelpTest { @Test public void shouldCreateReadableUsage() { String samples = ConfigurationHelp.getSamples(); - assertTrue("samples are not empty", !samples.isEmpty()); + assertFalse("samples are not empty", samples.isEmpty()); assertTrue("samples contains \"\n" + + " \n")); } } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java new file mode 100644 index 00000000000..3cd55d1f0a1 --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java @@ -0,0 +1,186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017-2020, Chris Fraire .
+ * Portions Copyright (c) 2020, Ric Harris .
+ */
+
+package org.opengrok.indexer.index;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.opengrok.indexer.condition.ConditionalRunRule;
+import org.opengrok.indexer.configuration.Project;
+import org.opengrok.indexer.configuration.RuntimeEnvironment;
+import org.opengrok.indexer.history.RepositoryFactory;
+import org.opengrok.indexer.util.TestRepository;
+import org.opengrok.indexer.web.Util;
+
+import java.io.IOException;
+import java.util.Queue;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+/** Verifies that files are re-indexed when the Huge Text threshold setting is changed.
+ * @author Trond Norbye
+ */
+public class HugeTextTest {
+
+    private static RuntimeEnvironment env;
+    private TestRepository repository;
+    private int savedHugeTextLimitCharacters;
+    private int savedHugeTextThresholdBytes;
+
+    @Rule
+    public ConditionalRunRule rule = new ConditionalRunRule();
+
+    @BeforeClass
+    public static void setUpClass() {
+        env = RuntimeEnvironment.getInstance();
+        RepositoryFactory.initializeIgnoredNames(env);
+    }
+
+    @Before
+    public void setUp() throws IOException {
+        repository = new TestRepository();
+        repository.create(HugeTextTest.class.getResourceAsStream("source.zip"));
+        // Remember the Huge Text settings so that tearDown() can restore them.
+        savedHugeTextLimitCharacters = env.getHugeTextLimitCharacters();
+        savedHugeTextThresholdBytes = env.getHugeTextThresholdBytes();
+    }
+
+    @After
+    public void tearDown() {
+        repository.destroy();
+
+        env.setHugeTextLimitCharacters(savedHugeTextLimitCharacters);
+        env.setHugeTextThresholdBytes(savedHugeTextThresholdBytes);
+    }
+
+    @Test
+    public void shouldIndexFilesPerChangingHugeTextSettings() throws Exception {
+        env.setSourceRoot(repository.getSourceRoot());
+        env.setDataRoot(repository.getDataRoot());
+        env.setRepositories(repository.getSourceRoot());
+
+        Project project = new Project("sql");
+        project.setPath("/sql");
+        // Pass 1, saved settings: both SQL files are expected to be added as SQLAnalyzer.
+        IndexDatabase idb = new IndexDatabase(project);
+        ConcurrentIndexChangeListener listener = new ConcurrentIndexChangeListener();
+        idb.addIndexChangedListener(listener);
+        idb.update();
+        assertEquals("should add expected files", 2, listener.addedFiles.size());
+        assertTrue("removedFiles should be empty", listener.removedFiles.isEmpty());
+        assertTrue("should have added /sql/test.sql", listener.addedFiles.contains(
+                new AddedFile("/sql/test.sql", "SQLAnalyzer")));
+        assertTrue("should have added /sql/bug18586.sql", listener.addedFiles.contains(
+                new AddedFile("/sql/bug18586.sql", "SQLAnalyzer")));
+        // Pass 2, 300-byte threshold: only test.sql is expected to be replaced, as Huge Text.
+        env.setHugeTextThresholdBytes(300);
+        listener.reset();
+        idb.update();
+        assertEquals("should add expected files", 1, listener.addedFiles.size());
+        assertEquals("should remove expected files", 1, listener.removedFiles.size());
+        assertTrue("should have added /sql/test.sql", listener.addedFiles.contains(
+                new AddedFile("/sql/test.sql", "HugeTextAnalyzer")));
+        assertTrue("should have removed /sql/test.sql", listener.removedFiles.contains(
+                "/sql/test.sql"));
+        // Pass 3, saved threshold restored: test.sql is expected to return to SQLAnalyzer.
+        env.setHugeTextThresholdBytes(savedHugeTextThresholdBytes);
+        listener.reset();
+        idb.update();
+        assertEquals("should add expected files", 1, listener.addedFiles.size());
+        assertEquals("should remove expected files", 1, listener.removedFiles.size());
+        assertTrue("should have added /sql/test.sql", listener.addedFiles.contains(
+                new AddedFile("/sql/test.sql", "SQLAnalyzer")));
+        assertTrue("should have removed /sql/test.sql", listener.removedFiles.contains(
+                "/sql/test.sql"));
+    }
+
+    private static class ConcurrentIndexChangeListener implements IndexChangedListener {
+        // Records completed add/remove notifications; the "about to" callbacks are ignored.
+        final Queue<AddedFile> addedFiles = new ConcurrentLinkedQueue<>();
+        final Queue<String> removedFiles = new ConcurrentLinkedQueue<>();
+
+        @Override
+        public void fileAdd(String path, String analyzer) {
+        }
+
+        @Override
+        public void fileAdded(String path, String analyzer) {
+            addedFiles.add(new AddedFile(Util.fixPathIfWindows(path), analyzer));
+        }
+
+        @Override
+        public void fileRemove(String path) {
+        }
+
+        @Override
+        public void fileRemoved(String path) {
+            removedFiles.add(Util.fixPathIfWindows(path));
+        }
+
+        void reset() {
+            this.addedFiles.clear();
+            this.removedFiles.clear();
+        }
+    }
+
+    private static class AddedFile {
+        final String path;
+        final String analyzer;
+
+        AddedFile(String path, String analyzer) {
+            this.path = path;
+            this.analyzer = analyzer;
+        }
+
+        /** Equal when both path and analyzer match. Generated by IntelliJ. */
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+
+            AddedFile addedFile = (AddedFile) o;
+
+            if (!path.equals(addedFile.path)) {
+                return false;
+            }
+            return analyzer.equals(addedFile.analyzer);
+        }
+
+        /** Combines the hashes of path and analyzer. Generated by IntelliJ. */
+        @Override
+        public int hashCode() {
+            int result = path.hashCode();
+            result = 31 * result + analyzer.hashCode();
+            return result;
+        }
+    }
+}
diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java
index f01bc63a9d4..4f212504bbe 100644
--- a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java
+++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java
@@ -19,7 +19,7 @@
 
 /*
  * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved.
- * Portions Copyright (c) 2017-2019, Chris Fraire .
+ * Portions Copyright (c) 2017-2020, Chris Fraire . * Portions Copyright (c) 2020, Ric Harris . */ package org.opengrok.indexer.index; @@ -202,10 +202,6 @@ public void fileAdded(String path, String analyzer) { public void fileRemove(String path) { } - @Override - public void fileUpdate(String path) { - } - @Override public void fileRemoved(String path) { removedFiles.add(path); @@ -281,10 +277,6 @@ public void fileAdded(String path, String analyzer) { public void fileRemove(String path) { } - @Override - public void fileUpdate(String path) { - } - @Override public void fileRemoved(String path) { // The test for the file existence needs to be performed here diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java new file mode 100644 index 00000000000..c59b3adeffc --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . 
+ */
+
+package org.opengrok.indexer.util;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Exercises {@link LimitedReader} over a fixed sample text using several character limits.
+ */
+public class LimitedReaderTest {
+
+    private static final String LIPSUM = "Lorem ipsum dolor sit amet, consectetur adipiscing " +
+            "elit. Proin dignissim sollicitudin est vitae aliquam. Nam leo nisl, lobortis at " +
+            "finibus nec, dignissim sed augue. Nullam commodo libero lectus, ac scelerisque ante " +
+            "luctus ac. Praesent varius volutpat lacinia. Praesent nec vulputate eros.";
+
+    /** A limit of -1 is expected to yield the entire sample text. */
+    @Test
+    public void shouldReadToMax() throws IOException {
+        assertEquals("should read to max", LIPSUM, readToLimit(-1));
+    }
+
+    /** A limit of 10 is expected to yield only the first ten characters. */
+    @Test
+    public void shouldReadToTruncated() throws IOException {
+        assertEquals("should read to truncated", "Lorem ipsu", readToLimit(10));
+    }
+
+    /** A limit of 0 is expected to yield an empty string. */
+    @Test
+    public void shouldReadNone() throws IOException {
+        assertEquals("should read nothing", "", readToLimit(0));
+    }
+
+    /** Drains a {@link LimitedReader} over the sample text in 37-char chunks. */
+    private static String readToLimit(int characterLimit) throws IOException {
+        StringBuilder collected = new StringBuilder();
+        char[] chunk = new char[37];
+        try (LimitedReader limited = new LimitedReader(new StringReader(LIPSUM), characterLimit)) {
+            for (int count; (count = limited.read(chunk, 0, chunk.length)) != -1; ) {
+                collected.append(chunk, 0, count);
+            }
+        }
+        return collected.toString();
+    }
+}
diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java
index b684ed262ef..8726a4d189e 100644
--- a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java
+++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java
@@ -19,7 +19,7 @@
 
 /*
  * Copyright (c) 2017, 2018 Oracle and/or its affiliates. All rights reserved.
- * Portions Copyright (c) 2018-2019, Chris Fraire .
+ * Portions Copyright (c) 2018-2020, Chris Fraire . */ package org.opengrok.indexer.util; @@ -162,6 +162,11 @@ public InputStream getStream() { assertNotNull("resource " + resourceName, res); return new BufferedInputStream(res); } + + @Override + public String getSourceIdentifier() { + return resourceName; + } }; } diff --git a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java index 0e8c4c6de90..e5268d66c90 100644 --- a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java +++ b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java @@ -266,7 +266,7 @@ public DiffData getDiffData() { + getUriEncodedPath() + "\">history"; return data; } - data.genre = AnalyzerGuru.getGenre(getResourceFile().getName()); + data.genre = AnalyzerGuru.getGenre(getResourceFile(), getResourceFile().getName()); if (data.genre == null || txtGenres.contains(data.genre)) { InputStream[] in = new InputStream[2]; @@ -298,14 +298,11 @@ public DiffData getDiffData() { * version. */ for (int i = 0; i < 2 && data.genre == null; i++) { - try { - data.genre = AnalyzerGuru.getGenre(in[i]); - } catch (IOException e) { - data.errorMsg = "Unable to determine the file type: " - + Util.htmlize(e.getMessage()); - } + data.genre = AnalyzerGuru.getGenre(in[i], getResourceFile().getName()); + } + if (data.genre == null) { + data.errorMsg = "Unable to determine the file type."; } - if (data.genre != AbstractAnalyzer.Genre.PLAIN && data.genre != AbstractAnalyzer.Genre.HTML) { return data; } @@ -731,14 +728,21 @@ public boolean hasAnnotations() { * * @return {@code true} if annotation is desired and available. 
*/ - public boolean annotate() { + public boolean shouldAnnotate() { if (annotate == null) { - annotate = hasAnnotations() - && Boolean.parseBoolean(req.getParameter(QueryParameters.ANNOTATION_PARAM)); + annotate = wantsAnnotation() && hasAnnotations(); } return annotate; } + /** + * Gets a value indicating if the user submitted an affirmative value for + * the {@link QueryParameters#ANNOTATION_PARAM}. + */ + public boolean wantsAnnotation() { + return Boolean.parseBoolean(req.getParameter(QueryParameters.ANNOTATION_PARAM)); + } + /** * Get the annotation for the requested resource. * @@ -746,7 +750,7 @@ public boolean annotate() { * the cached annotation otherwise. */ public Annotation getAnnotation() { - if (isDir() || getResourcePath().equals("/") || !annotate()) { + if (isDir() || getResourcePath().equals("/") || !shouldAnnotate()) { return null; } if (annotation != null) { @@ -1488,6 +1492,11 @@ public File getDataRoot() { * executing the prepared query or continue processing. *

* This method stops populating fields as soon as an error occurs. + *

+ * The result is stored as a request attribute keyed to + * {@link SearchHelper#REQUEST_ATTR} for later cleanup via + * {@link SearchHelper#destroy()}. Any object already set will have + * {@link SearchHelper#destroy()} called. * * @return a search helper. */ @@ -1519,9 +1528,21 @@ public SearchHelper prepareSearch() { * executing the prepared query or continue processing. *

* This method stops populating fields as soon as an error occurs. + *

+ * The result is stored as a request attribute keyed to + * {@link SearchHelper#REQUEST_ATTR} for later cleanup via + * {@link SearchHelper#destroy()}. Any object already set will have + * {@link SearchHelper#destroy()} called. + * * @return a search helper. */ public SearchHelper prepareInternalSearch() { + Object cached = req.getAttribute(SearchHelper.REQUEST_ATTR); + if (cached != null) { + req.setAttribute(SearchHelper.REQUEST_ATTR, null); + ((SearchHelper) cached).destroy(); + } + SearchHelper sh = new SearchHelper(); sh.dataRoot = getDataRoot(); // throws Exception if none-existent sh.order = SortOrder.RELEVANCY; @@ -1537,6 +1558,13 @@ public SearchHelper prepareInternalSearch() { sh.sourceRoot = new File(getSourceRootPath()); String xrValue = req.getParameter(QueryParameters.NO_REDIRECT_PARAM); sh.noRedirect = xrValue != null && !xrValue.isEmpty(); + + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of the following + * REQUEST_ATTR. + */ + req.setAttribute(SearchHelper.REQUEST_ATTR, sh); return sh; } @@ -1776,7 +1804,9 @@ public boolean isNotModified(HttpServletRequest request, HttpServletResponse res // last timestamp value getEnv().getDateForLastIndexRun() != null ? getEnv().getDateForLastIndexRun().getTime() : 0, // OpenGrok version has changed since the last time - Info.getVersion() + Info.getVersion(), + // Whether the user indicated to annotate + wantsAnnotation() ) ); diff --git a/opengrok-web/src/main/webapp/history.jsp b/opengrok-web/src/main/webapp/history.jsp index 97deda04873..65ab2c52a72 100644 --- a/opengrok-web/src/main/webapp/history.jsp +++ b/opengrok-web/src/main/webapp/history.jsp @@ -1,6 +1,4 @@ <%-- -$Id$ - CDDL HEADER START The contents of this file are subject to the terms of the @@ -61,13 +59,12 @@ org.opengrok.indexer.web.Util" String primePath = path; Project project = cfg.getProject(); if (project != null) { - SearchHelper searchHelper = cfg.prepareInternalSearch(); /* * N.b. 
searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of the following - * REQUEST_ATTR. + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. */ - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); + SearchHelper searchHelper = cfg.prepareInternalSearch(); searchHelper.prepareExec(project); try { @@ -112,7 +109,7 @@ include file="httpheader.jspf" diff --git a/opengrok-web/src/main/webapp/list.jsp b/opengrok-web/src/main/webapp/list.jsp index 1e5aa346dba..f194a3f00a8 100644 --- a/opengrok-web/src/main/webapp/list.jsp +++ b/opengrok-web/src/main/webapp/list.jsp @@ -40,7 +40,6 @@ java.util.TreeSet, org.opengrok.indexer.analysis.AnalyzerGuru, org.opengrok.indexer.analysis.Ctags, org.opengrok.indexer.analysis.Definitions, -org.opengrok.indexer.analysis.AbstractAnalyzer, org.opengrok.indexer.analysis.AbstractAnalyzer.Genre, org.opengrok.indexer.analysis.AnalyzerFactory, org.opengrok.indexer.history.Annotation, @@ -62,6 +61,8 @@ org.opengrok.indexer.web.SearchHelper" final String DUMMY_REVISION = "unknown"; { + resetData(); + // need to set it here since requesting parameters if (request.getCharacterEncoding() == null) { request.setCharacterEncoding("UTF-8"); @@ -108,14 +109,27 @@ final String DUMMY_REVISION = "unknown"; } } - Annotation annotation = cfg.getAnnotation(); - if (annotation != null) { - int r = annotation.getWidestRevision(); - int a = annotation.getWidestAuthor(); - cfg.addHeaderData(""); + // Set just after the redirects above so that the field is defined early. 
+ project = cfg.getProject(); + + boolean isAnnotatableGenre = false; + if (cfg.wantsAnnotation() && !cfg.isDir()) { + prepareExec(cfg); + if (searchHelper.searcher != null) { + genre = searchHelper.searchSingleGenre(cfg.getResourceFile()); + isAnnotatableGenre = Genre.PLAIN.equals(genre); + } + } + if (isAnnotatableGenre) { + Annotation annotation = cfg.getAnnotation(); + if (annotation != null) { + int r = annotation.getWidestRevision(); + int a = annotation.getWidestAuthor(); + cfg.addHeaderData(""); + } } } %><%@include @@ -132,7 +146,6 @@ document.pageReady.push(function() { pageReadyList();}); PageConfig cfg = PageConfig.get(request); String rev = cfg.getRequestedRevision(); - Project project = cfg.getProject(); String navigateWindowEnabled = project != null ? Boolean.toString( project.isNavigateWindowEnabled()) : "false"; @@ -166,20 +179,8 @@ document.pageReady.push(function() { pageReadyList();}); List files = cfg.getResourceFileList(); if (!files.isEmpty()) { List extras = null; - SearchHelper searchHelper = cfg.prepareInternalSearch(); - /* - * N.b. searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of the following - * REQUEST_ATTR. 
- */ - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); - if (project != null) { - searchHelper.prepareExec(project); - } else { - //noinspection Convert2Diamond - searchHelper.prepareExec(new TreeSet()); - } + prepareExec(cfg); if (searchHelper.searcher != null) { DirectoryExtraReader extraReader = new DirectoryExtraReader(); String primePath = path; @@ -236,23 +237,20 @@ document.pageReady.push(function() { pageReadyList();}); File xrefFile; if (cfg.isLatestRevision(rev) && (xrefFile = cfg.findDataFile()) != null) { - if (cfg.annotate()) { + if (cfg.shouldAnnotate()) { // annotate BufferedInputStream bin = new BufferedInputStream(new FileInputStream(resourceFile)); try { - AnalyzerFactory a = AnalyzerGuru.find(basename); - AbstractAnalyzer.Genre g = AnalyzerGuru.getGenre(a); - if (g == null) { - a = AnalyzerGuru.find(bin); - g = AnalyzerGuru.getGenre(a); - } - if (g == AbstractAnalyzer.Genre.IMAGE) { + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory( + resourceFile, basename, true); + genre = AnalyzerGuru.getGenre(fac); + if (genre == Genre.IMAGE) { %>

Image from Source Repository
<% - } else if ( g == AbstractAnalyzer.Genre.HTML) { + } else if (genre == Genre.HTML) { /** * For backward compatibility, read the OpenGrok-produced * document using the system default charset. @@ -260,7 +258,7 @@ document.pageReady.push(function() { pageReadyList();}); r = new InputStreamReader(bin); // dumpXref() is also useful here for translating links. Util.dumpXref(out, r, request.getContextPath()); - } else if (g == AbstractAnalyzer.Genre.PLAIN) { + } else if (genre == Genre.PLAIN) { %>
<%
@@ -271,7 +269,7 @@ document.pageReady.push(function() { pageReadyList();});
                         // SRCROOT is read with UTF-8 as a default.
                         r = IOUtils.createBOMStrippedReader(bin,
                             StandardCharsets.UTF_8.name());
-                        AnalyzerGuru.writeDumpedXref(request.getContextPath(), a,
+                        AnalyzerGuru.writeDumpedXref(request.getContextPath(), fac,
                                 r, out, defs, annotation, project);
     %>
<% @@ -302,10 +300,20 @@ Click download <%= basename %><% } } else { // requesting a previous revision or needed to generate xref on the fly (economy mode). - AnalyzerFactory a = AnalyzerGuru.find(basename); - Genre g = AnalyzerGuru.getGenre(a); + + AnalyzerFactory fac = rev.equals(DUMMY_REVISION) ? AnalyzerGuru.getAnalyzerFactory( + resourceFile, path, true) : AnalyzerGuru.find(basename); + if (genre == null) { + prepareExec(cfg); + if (searchHelper.searcher != null) { + genre = searchHelper.searchSingleGenre(resourceFile); + } + } + if (genre == null) { + genre = AnalyzerGuru.getGenre(fac); + } String error = null; - if (g == Genre.PLAIN || g == Genre.HTML || g == null) { + if (genre == Genre.PLAIN || genre == Genre.HTML || genre == null) { InputStream in = null; File tempf = null; try { @@ -331,21 +339,21 @@ Click download <%= basename %><% } if (in != null) { try { - if (g == null) { - a = AnalyzerGuru.find(in, basename); - g = AnalyzerGuru.getGenre(a); + if (fac == null || genre == null) { + fac = AnalyzerGuru.getAnalyzerFactory(in, basename); + genre = AnalyzerGuru.getGenre(fac); } - if (g == AbstractAnalyzer.Genre.DATA || g == AbstractAnalyzer.Genre.XREFABLE || g == null) { + if (genre == Genre.DATA || genre == Genre.XREFABLE || genre == null) { %> <% } else { %>
<%
-                            if (g == AbstractAnalyzer.Genre.PLAIN) {
+                            if (genre == Genre.PLAIN) {
                                 Definitions defs = null;
                                 ObjectPool ctagsPool = cfg.getEnv().
                                         getIndexerParallelizer().getCtagsPool();
@@ -377,31 +385,23 @@ Click download <%= basename %><%
                                 Annotation annotation = cfg.getAnnotation();
                                 //not needed yet
                                 //annotation.writeTooltipMap(out);
-                                // SRCROOT is read with UTF-8 as a default.
+                                // sourceRoot is read with UTF-8 as a default.
                                 r = IOUtils.createBOMStrippedReader(in,
-                                    StandardCharsets.UTF_8.name());
-                                AnalyzerGuru.writeDumpedXref(
-                                        request.getContextPath(),
-                                        a, r, out,
+                                        StandardCharsets.UTF_8.name());
+                                AnalyzerGuru.writeDumpedXref(request.getContextPath(), fac, r, out,
                                         defs, annotation, project);
-                            } else if (g == AbstractAnalyzer.Genre.IMAGE) {
+                            } else if (genre == Genre.IMAGE) {
         %>
<%
-                            } else if (g == AbstractAnalyzer.Genre.HTML) {
-                                /**
-                                 * For backward compatibility, read the
-                                 * OpenGrok-produced document using the system
-                                 * default charset.
-                                 */
-                                r = new InputStreamReader(in);
-                                /**
-                                 * dumpXref() is also useful here for
-                                 * translating links.
-                                 */
-                                Util.dumpXref(out, r, request.getContextPath());
+                            } else if (genre == Genre.HTML) {
+                                // sourceRoot is read with UTF-8 as a default.
+                                r = IOUtils.createBOMStrippedReader(in,
+                                        StandardCharsets.UTF_8.name());
+                                AnalyzerGuru.writeDumpedXref(request.getContextPath(), fac, r, out,
+                                        null, null, project);
                             } else {
-        %>Download binary file, ?<%= QueryParameters.REVISION_PARAM_EQ %>
 <%= Util.URIEncode(rev) %>"><%= basename %><%
                             }
                         }
@@ -430,7 +430,7 @@ Click download <%= basename %><%
     

<%= error %>

<% } } - } else if (g == AbstractAnalyzer.Genre.IMAGE) { + } else if (genre == Genre.IMAGE) { %>
download <%= basename %><% } else { %> <% } @@ -457,8 +457,16 @@ Click download <%= basename %><% %>
<% } else { + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(resourceFile, path, true); + if (Genre.DATA.equals(fac.getGenre())) { + %> +
+ Download file, <%= basename %> +
<% + } else { %>

Failed to get xref file

<% + } } } } @@ -468,3 +476,32 @@ Click download <%= basename %><% include file="foot.jspf" %> +<%! + private Project project; + private SearchHelper searchHelper; + private Genre genre; + + private void resetData() { + project = null; + searchHelper = null; + genre = null; + } + + private void prepareExec(PageConfig cfg) { + if (searchHelper == null) { + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. + */ + searchHelper = cfg.prepareInternalSearch(); + + if (project != null) { + searchHelper.prepareExec(project); + } else { + //noinspection Convert2Diamond + searchHelper.prepareExec(new TreeSet()); + } + } + } +%> diff --git a/opengrok-web/src/main/webapp/mast.jsp b/opengrok-web/src/main/webapp/mast.jsp index 8d3463f9031..207ed25c991 100644 --- a/opengrok-web/src/main/webapp/mast.jsp +++ b/opengrok-web/src/main/webapp/mast.jsp @@ -1,6 +1,4 @@ <%-- -$Id$ - CDDL HEADER START The contents of this file are subject to the terms of the @@ -20,12 +18,9 @@ CDDL HEADER END Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. Portions Copyright 2011 Jens Elkner. -Portions Copyright (c) 2018, Chris Fraire . - ---%><%-- +Portions Copyright (c) 2018, 2020, Chris Fraire . After include you are here: /body/div#page/div#content/ - --%> <%@page contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%> <%@page import="org.opengrok.indexer.web.messages.MessagesContainer"%> @@ -71,7 +66,7 @@ include file="httpheader.jspf" %> diff --git a/opengrok-web/src/main/webapp/minisearch.jspf b/opengrok-web/src/main/webapp/minisearch.jspf index 044a0887ccb..85758708b82 100644 --- a/opengrok-web/src/main/webapp/minisearch.jspf +++ b/opengrok-web/src/main/webapp/minisearch.jspf @@ -45,7 +45,7 @@ org.opengrok.indexer.web.Util"%><% } if (!cfg.hasAnnotations() /* || cfg.getPrefix() == Prefix.HIST_S */ ) { %>
  • Annotate
  • <% - } else if (cfg.annotate()) { + } else if (cfg.shouldAnnotate()) { %>