Tuesday, March 10, 2015

Java: RegEx: WebSpider

In this tutorial we will create a basis for a WebSpider program that fetches all links from the given Web page, checks which have extensions of interest and downloads these files.

The result of the app will be a folder with Google Earth map overlays:


/Users/uki/Desktop/KMZ
├── 1.kmz
├── 10.kmz
├── 11.kmz
├── 12.kmz
├── 13.kmz
├── 14.kmz
├── 15.kmz
├── 16.kmz
├── 17.kmz
├── 18.kmz


We will use basic Java networking classes:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

and regular expressions:

/**
 * Regular-expression building blocks used by the WebSpider tutorial:
 * a file-extension matcher, an anchor-tag matcher and a URL matcher.
 */
public class RegExConstants {

   /**
    * Builds a regular expression that matches a whitespace-free string
    * ending in a dot followed by one of the given extensions, compared
    * case-insensitively.
    *
    * <p>For {@code {"kmz", "kml"}} the generated pattern reads:
    * <pre>
    *  (                -- start of main grouping
    *     [^\s]+        -- one or more non-whitespace characters
    *     (             -- start of extension grouping
    *        \.         -- a literal dot, e.g.: .kmz
    *        (?i)       -- NOT case sensitive from here on
    *        (kmz|kml)  -- kmz OR kml
    *     )$            -- must be at the end of the input
    *  )                -- end of main grouping
    * </pre>
    *
    * <p>The extensions are inserted into the pattern verbatim (they are
    * not quoted), so any regex metacharacters they contain are interpreted
    * as regex syntax. The generated pattern is also printed to
    * {@code System.out}.
    *
    * @param extensions file extensions without the leading dot, e.g.
    *                   {@code "kmz"}; an empty array yields an empty
    *                   alternation group
    * @return the regular expression as a string
    */
   public static String fileExtensionPattern(String[] extensions) {

      // Join with the regex OR operator so that each extension is a
      // separate alternative: {"kmz", "kml"} -> "kmz|kml".
      String alternatives = String.join("|", extensions);

      String pattern = "([^\\s]+(\\.(?i)(" + alternatives + "))$)";
      System.out.println("fileExtensionPattern: " + pattern);
      return pattern;
   }

   /**
    * Matches an HTML anchor element that starts with {@code <a}, optional
    * spaces and {@code href="}. Group 1 lazily captures everything after
    * the opening quote up to (not including) the next closing anchor tag.
    */
   public static final String anchorTagPattern = "<a *href=\"(.+?)</a>";

   /**
    * Matches an http, https, ftp or file URL. The final character class
    * omits {@code ? ! : , . ;} so a match cannot end on one of those
    * punctuation characters.
    */
   public static final String urlPattern = "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
}