View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */ 
19  package org.apache.rat.document.impl.guesser;
20  
21  import java.io.IOException;
22  import java.io.Reader;
23  import java.util.Locale;
24  
25  import org.apache.rat.api.Document;
26  
27  /**
28   * TODO: factor into MIME guesser and MIME->binary guesser
29   */
30  public class BinaryGuesser {
31  
32      private static boolean isBinaryDocument(Document document) {
33          boolean result = false;
34          Reader reader = null;
35          try
36          {
37              reader = document.reader();
38              result = isBinary(reader);
39          }
40          catch (IOException e)
41          {
42              result = false;
43          }
44          finally
45          {
46              try
47              {
48                  if (reader != null)
49                  {
50                      reader.close();
51                  }
52              }
53              catch (IOException e)
54              {
55                  // SWALLOW
56              }   
57          }
58          return result;
59      }
60      
61      /**
62       * Do the first few bytes of the stream hint at a binary file?
63       */
64      public static boolean isBinary(Reader in) {
65          boolean result = false;
66          char[] taste = new char[100];
67          try {
68              int bytesRead = in.read(taste);
69              if (bytesRead > 0) {
70                  int highBytes = 0;
71                  for (int i=0;i<bytesRead;i++) {
72                      if (taste[i] > BinaryGuesser.NON_ASCII_THREASHOLD
73                          || taste[i] <= BinaryGuesser.ASCII_CHAR_THREASHOLD) {
74                          highBytes++;
75                      }
76                  }
77                  if (highBytes * BinaryGuesser.HIGH_BYTES_RATIO
78                      > bytesRead * BinaryGuesser.TOTAL_READ_RATIO) {
79                      result = true;
80                  }
81              }
82          } catch (IOException e) {
83              // SWALLOW 
84          }
85          return result;
86      }
87  
88      public static final boolean isBinaryData(final String name) {
89          return extensionMatches(name, DATA_EXTENSIONS);
90      }
91  
92      /**
93       * Is a file by that name a known non-binary file?
94       */
95      public static final boolean isNonBinary(final String name) {
96          if (name == null) {return false;}
97          return extensionMatches(name.toUpperCase(Locale.US),
98                                  BinaryGuesser.NON_BINARY_EXTENSIONS);
99      }
100 
101     public static final boolean isExecutable(final String name) {
102         return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
103             || containsExtension(name, EXE_EXTENSIONS);
104     }
105 
106     public static boolean containsExtension(final String name,
107                                              final String[] exts) {
108         boolean result = false;
109         for (int i = 0; !result && i < exts.length; i++) {
110             result = name.indexOf("." + exts[i] + ".") >= 0;
111         }
112         return result;
113     }
114 
115     public static boolean extensionMatches(final String name,
116                                             final String[] exts) {
117         boolean result = false;
118         for (int i = 0; !result && i < exts.length; i++) {
119             result = name.endsWith("." + exts[i]);
120         }
121         return result;
122     }
123 
124     public static boolean isBytecode(final String name) {
125         return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
126     }
127 
128     public static final boolean isImage(final String name) {
129         return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
130     }
131 
132     public static final boolean isKeystore(final String name) {
133         return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
134     }
135     
136     /**
137      * Is a file by that name a known binary file?
138      */
139     public static final boolean isBinary(final String name) {
140         if (name == null) {return false;}
141         String normalisedName = GuessUtils.normalise(name);
142         return BinaryGuesser.JAR_MANIFEST.equals(name) || BinaryGuesser.isImage(normalisedName)
143             || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
144             || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName);
145     }
146 
147     public static final String[] DATA_EXTENSIONS = {
148         "DAT", "DOC",
149         "NCB", "IDB",
150         "SUO", "XCF",
151         "RAJ", "CERT",
152         "KS", "TS",
153         "ODP",
154     };
155     public static final String[] EXE_EXTENSIONS = {
156         "EXE", "DLL",
157         "LIB", "SO",
158         "A", "EXP",
159     };
160     public static final String[] KEYSTORE_EXTENSIONS = {
161         "JKS", "KEYSTORE", "PEM", "CRL"
162     };
163     public static final String[] IMAGE_EXTENSIONS = {
164         "PNG", "PDF",
165         "GIF", "GIFF",
166         "TIF", "TIFF",
167         "JPG", "JPEG",
168         "ICO", "ICNS",
169     };
170     public static final String[] BYTECODE_EXTENSIONS = {
171         "CLASS", "PYD",
172         "OBJ", "PYC",
173     };
174     
175     /**
176      * Based on http://www.apache.org/dev/svn-eol-style.txt
177      */
178     public static final String[] NON_BINARY_EXTENSIONS = {
179         "AART",
180         "AC",
181         "AM",
182         "BAT",
183         "C",
184         "CAT",
185         "CGI",
186         "CLASSPATH",
187         "CMD",
188         "CONFIG",
189         "CPP",
190         "CSS",
191         "CWIKI",
192         "DATA",
193         "DCL",
194         "DTD",
195         "EGRM",
196         "ENT",
197         "FT", 
198         "FN",
199         "FV", 
200         "GRM",
201         "G",
202         "H",
203         "HTACCESS",
204         "HTML",
205         "IHTML",
206         "IN",
207         "JAVA",
208         "JMX", 
209         "JSP",
210         "JS",
211         "JUNIT",
212         "JX", 
213         "MANIFEST",
214         "M4",
215         "MF",
216         "MF",
217         "META",
218         "MOD",
219         "N3",
220         "PEN",
221         "PL",
222         "PM",
223         "POD",
224         "POM",
225         "PROJECT",
226         "PROPERTIES",
227         "PY",
228         "RB",
229         "RDF",
230         "RNC",
231         "RNG",
232         "RNX",
233         "ROLES",
234         "RSS",
235         "SH",
236         "SQL",
237         "SVG",
238         "TLD",
239         "TXT",
240         "TYPES",
241         "VM",
242         "VSL",
243         "WSDD",
244         "WSDL",
245         "XARGS",
246         "XCAT",
247         "XCONF",
248         "XEGRM",
249         "XGRM",
250         "XLEX",
251         "XLOG",
252         "XMAP",
253         "XML",
254         "XROLES",
255         "XSAMPLES",
256         "XSD",
257         "XSL",
258         "XSLT",
259         "XSP",
260         "XUL",
261         "XWEB",
262         "XWELCOME",
263     };
264     public static final String JAR_MANIFEST = "MANIFEST.MF";
265     public static final String JAVA = "JAVA";
266     public static final int HIGH_BYTES_RATIO = 100;
267     public static final int TOTAL_READ_RATIO = 30;
268     public static final int NON_ASCII_THREASHOLD = 256;
269     public static final int ASCII_CHAR_THREASHOLD = 8;
270 
271     public static final boolean isBinary(final Document document) {
272         // TODO: reimplement the binary test algorithm?
273         // TODO: more efficient to move into standard analysis
274         // TODO: then use binary as default
275         final String name = document.getName();
276         boolean result = isBinary(name);
277         if (!result)
278         {
279             // try a taste
280             result = isBinaryDocument(document);
281         }
282         return result;
283     }
284 
285 
286 
287 }