Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
languagejava
firstline93
titledownloadStream method
linenumberstrue
collapsetrue
  /**
   * Prints the item's name and, when the item carries a content stream, copies
   * that stream into a file named after the item inside the {@code output/}
   * directory.
   *
   * <p>Any exception raised while opening, copying or closing is caught and its
   * stack trace printed, so a failure on one item never propagates to the caller.
   *
   * @param item the item whose content stream should be written to disk
   */
  private static void downloadStream(SourceItem item) {
    System.out.println("Item: "+item.getName());
    // try-with-resources guarantees both streams are closed even when the copy
    // (or opening the output file) fails; a null resource is simply skipped.
    try (InputStream is = item.getContentStream()) {
      if (is != null) {
        try (FileOutputStream fos = new FileOutputStream(new File("output/"+item.getName()))) {
          copyStream(is, fos);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }


Step 4

...

The Scanner class:

This is where the crawl gets done. When the processQueue() method is called, the following happens:

  1. The current items get moved to another queue to be iterated
  2. For each item in the queue
    1. It checks if the item is a container
    2. It calls the populate method on the items that need to be processed
    3. It calls the fetcher to process the item
    4. If the item was a container, call the scan() method with it (this adds more items into the queue)
Code Block
languagejava
firstline179
titleScanner class
linenumberstrue
collapsetrue
  /**
   * Scan listener that collects the items reported by the RAP and, on demand,
   * populates, fetches and hands each of them to a caller-supplied processor.
   */
  static class Scanner implements ScanListener {

    /**
     * Receives every newly discovered item
     */
    ArrayList<SourceItem> queue;

    /**
     * Working copy of the queue; it is the list that actually gets iterated,
     * so the original queue stays free to receive items in the meantime
     */
    ArrayList<SourceItem> safeQueue;
    private RepositoryAccessProvider rap;
    private SourceInfo info;
    private FetchURL fetcher;

    /**
     * Creates the scanner with empty queues and an initialized fetcher.
     *
     * @param rap the repository access provider used to populate and scan items
     * @param info the source information passed along to the RAP and to every job
     * @throws AspireException declared for failures raised while setting up the fetcher
     */
    public Scanner(RepositoryAccessProvider rap, SourceInfo info) throws AspireException {
      this.queue = new ArrayList<>();
      this.safeQueue = new ArrayList<>();
      this.rap = rap;
      this.info = info;
      this.fetcher = new StandaloneFetchURL(rap);
      this.fetcher.initialize(emptyDom);
    }

    /**
     * Closes the underlying fetcher.
     */
    public void close() {
      fetcher.close();
    }

    @Override
    public void addItem(SourceItem item) {
      // Callback for a single item reported by the RAP scanner
      queue.add(item);
    }

    @Override
    public void addItems(List<SourceItem> items) {
      // Callback for a batch of items reported by the RAP scanner
      queue.addAll(items);
    }

    /**
     * Drains the current queue and runs every drained item through the crawl
     * steps: container check, populate + fetch (when applicable), scan (for
     * containers) and finally the given processor.
     *
     * @param processor callback invoked once per item after it has been handled
     * @throws AspireException if any of the RAP or fetcher calls fails
     */
    public void processQueue(Consumer<SourceItem> processor) throws AspireException {
      RepositoryConnection conn = rap.newConnection(info);

      // Snapshot the pending items and empty the original queue, so the scan
      // calls below can append to it while the snapshot is being iterated
      safeQueue.clear();
      safeQueue.addAll(queue);
      queue.clear();

      for (SourceItem current : safeQueue) {
        final boolean isContainer = rap.isContainer(current, conn);

        // Containers are only populated/fetched when the source asks for them to be indexed
        if (info.indexContainers() || !isContainer) {
          populateAndFetch(current, conn);
        }

        if (isContainer) {
          // This scanner is passed as the listener for whatever the scan reports
          rap.scan(current, info, conn, this);
        }
        processor.accept(current);
      }
      safeQueue.clear();
    }

    /**
     * Populates the item, runs it through the fetcher as a job and attaches the
     * job's "contentStream" entry to the item.
     */
    private void populateAndFetch(SourceItem item, RepositoryConnection conn) throws AspireException {
      rap.populate(item, info, conn);

      Job fetchJob = JobFactory.newInstance(item.generateJobDocument());
      fetchJob.put("sourceInfo", info);
      fetchJob.put("crawlController", info.getCrawlController());
      fetcher.process(fetchJob);

      InputStream content = (InputStream) fetchJob.get("contentStream");
      item.setContentStream(content);
    }

    /**
     * @return true when neither the queue nor its working copy holds any item
     */
    public boolean isQueueEmpty() {
      return queue.isEmpty() && safeQueue.isEmpty();
    }
  }

Step 5

...

Fetcher class

This class exists only to override the getComponent method so that it returns the correct RAP object.

Code Block
languagejava
firstline259
titleFetcher class
linenumberstrue
collapsetrue
  /**
   * FetchURL variant whose component lookup resolves the {@link #RAP} path to
   * the repository access provider supplied at construction time.
   */
  private static class StandaloneFetchURL extends FetchURL {

    /** Component path that resolves to the RAP */
    public static final String RAP = "rap";
    RepositoryAccessProvider rap;

    /**
     * @param rap the provider to return when the {@link #RAP} path is requested
     */
    public StandaloneFetchURL(RepositoryAccessProvider rap) {
      this.rap = rap;
    }

    /**
     * @param path the component path being looked up
     * @return the RAP cast to a Component when path equals {@link #RAP}, null for any other path
     */
    @Override
    public Component getComponent(String path) {
      return RAP.equals(path) ? (Component) rap : null;
    }
  }

StandaloneCrawlController class:

This class exists only to override the getNoSQLConnection method, which some components need, so that it returns a dummy NoSQLConnection object.

Code Block
languagejava
firstline277
titleStandaloneCrawlController class
linenumberstrue
collapsetrue
  /**
   * Crawl controller that hands out a fixed RAP and a do-nothing NoSQL connection.
   */
  private static class StandaloneCrawlController extends CrawlControllerImpl {

    RepositoryAccessProvider rap;

    /**
     * @param rap the provider returned by {@link #getRAP()}
     */
    public StandaloneCrawlController(RepositoryAccessProvider rap) {
      this.rap = rap;
    }

    /**
     * Ignores both arguments and always supplies a new dummy connection.
     *
     * @return a new EmptyNoSQLConnection on every call
     */
    @Override
    public NoSQLConnection getNoSQLConnection(String name, AspireObject properties) {
      return new EmptyNoSQLConnection();
    }

    /**
     * @return the provider given to the constructor
     */
    @Override
    public RepositoryAccessProvider getRAP() {
      return rap;
    }
  }

EmptyNoSQLConnection class:

As its name suggests, this is a dummy NoSQLConnection; it doesn't really do anything.

Code Block
languagejava
firstline16
titleEmptyNoSQLConnection
linenumberstrue
collapsetrue
public class EmptyNoSQLConnection implements NoSQLConnection {

      @Override
      public String getDatabase() {
        return null;
      }

      @Override
      public String getCollection() {
        return null;
      }

      @Override
      public void add(AspireObject item) throws AspireException {
      }

      @Override
      public void update(AspireObject item, String id) throws AspireException {
      }

      @Override
      public void update(AspireObject item, AspireObject filter)
          throws AspireException {
      }

      @Override
      public void updateAll(AspireObject item, AspireObject filter)
          throws AspireException {
      }

      @Override
      public void updateOrAdd(AspireObject item, AspireObject filter)
          throws AspireException {
      }

      @Override
      public boolean delete(String id) throws AspireException {
        return false;
      }

      @Override
      public boolean delete(AspireObject filter) throws AspireException {
        return false;
      }

      @Override
      public boolean deleteAll(AspireObject filter) throws AspireException {
        return false;
      }

      @Override
      public AspireObject getOneAndUpdate(AspireObject filter,
          AspireObject update) throws AspireException {
        return null;
      }

      @Override
      public AspireObject getOne(AspireObject filter) throws AspireException {
        return null;
      }

      @Override
      public NoSQLIterable<AspireObject> getAll(AspireObject filter)
          throws AspireException {
        return null;
      }

      @Override
      public NoSQLIterable<AspireObject> getAll(AspireObject filter, int skip)
          throws AspireException {
        return null;
      }

      @Override
      public NoSQLIterable<AspireObject> getAll() throws AspireException  static class Scanner implements ScanListener {

    /**
     * The queue that receives all the new items discovered
     */
    ArrayList<SourceItem> queue;
    
    /**
     * Temporary queue used for iterate the original queue
     */
    ArrayList<SourceItem> safeQueue;
    private RepositoryAccessProvider rap;
    private SourceInfo info;
    private FetchURL fetcher;
    
    public Scanner(RepositoryAccessProvider rap, SourceInfo info) throws AspireException {
      this.rap = rap;
      this.info = info;
      queue = new ArrayList<SourceItem>();
      safeQueue = new ArrayList<SourceItem>();
      fetcher = new StandaloneFetchURL(rap);
      fetcher.initialize(emptyDom);
    }
    
    public void close() {
      fetcher.close();
    }

    @Override
    public void addItem(SourceItem item) {
      //This gets called when the RAP scanner adds an item to crawl
      queue.add(item);
    }

    @Override
    public void addItems(List<SourceItem> items) {
      //This gets calledreturn whennull;
 the RAP scanner adds items to crawl}

      queue.addAll(items);@Override
    }
  public NoSQLIterable<AspireObject> getAll(int skip)
    public void processQueue(Consumer<SourceItem> processor) throws AspireException {
    throws AspireException {
      RepositoryConnection conn = rap.newConnection(info)return null;
      }

      //Move the items from the original queue into the safeQueue@Override
      public long size() throws AspireException {
      //And clear thereturn orignal0;
      safeQueue.clear();}

      safeQueue.addAll(queue);@Override
      queue.clear();
   public long size(AspireObject filter) throws AspireException {
    
    return 0;
 for (SourceItem item : safeQueue) {}

      @Override
  boolean container = false;
 public void clear()   throws AspireException {
      }

   if (rap.isContainer(item, conn)) {@Override
      public void close() throws container = true;
  AspireException {
      }

        @Override
      public AspireObject if (info.indexContainers() || !container) {
   getAspireObject(Object obj) throws AspireException {
       rap.populate(item, info, conn) return null;
      }

    //Call the fetcher@Override
      public    Job job = JobFactory.newInstance(item.generateJobDocument());void flush() {
       }

   job.put("sourceInfo", info);
   @Override
      public void job.put("crawlController", info.getCrawlController());setBulkTimeout(long timeout) {
      }

    fetcher.process(job);
   @Override
       item.setContentStream((InputStream) job.get("contentStream"));
  public void setBulkSize(int size) {
      }

        @Override
      public void if (containeruseBulk(boolean useBulk) {
      }

      @Override
      public AspireObject rap.scan(item, info, conn, this);
getOneAndUpdateOrAdd(AspireObject update,
          AspireObject filter) throws AspireException }{
         processor.accept(item)return null;
      }

      safeQueue.clear();
@Override
       }
public AspireObject getOneAndDelete(AspireObject filter)
     
    public booleanthrows isQueueEmpty()AspireException {
      return queue.size()+safeQueue.size() == 0 return null;
    }
  }
}



For standalone crawls with Legacy connectors, see:

...