Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
languagejava
firstline52
titleCrawl Initialization
linenumberstrue
collapsetrue
public static void main(String[] args) {  
  
    //Get the scan properties from the arguments
    String scanPropertiesFile = args[0];
    
    //Instantiate a new RepositoryAccessProvider from the connector implementation
    JobFactory.initializeForStandAloneUsage();
    
    ComponentImpl component = new SP2013RAP();
    component.initialize(emptyDom);
    RepositoryAccessProvider rap = (RepositoryAccessProvider) component;
    
    //Create a CrawlControllerImpl, as some connectors depends on it
    StandaloneCrawlController crawlCtrlImpl = new StandaloneCrawlController(rap);
    
    //Load the content-source.xml configuration into an AspireObject
    AspireObject scanProps = new AspireObject("doc");
    AspireObject scanPropsFile = AspireObject.createFromXML(new File(scanPropertiesFile));
    scanProps.add(scanPropsFile);
    
    //Create and initialize a new SourceInfo from the RAP
    SourceInfo info = rap.newSourceInfo(scanPropsFile);
    info.initialize(scanProps);
    info.setCrawlController(crawlCtrlImpl);
    crawlCtrlImpl.setSourceInfo(info);

Step 3

...

Start the crawl:

Code Block
languagejava
firstline78
titleStart crawl call
linenumberstrue
collapsetrue
    //execute the crawl
    crawl(rap, info, Main::downloadStream);


the crawl() method:

Code Block
languagejava
firstline164
titlecrawl method
linenumberstrue
collapsetrue
  private static void crawl(RepositoryAccessProvider rap, SourceInfo info, Consumer<SourceItem> processor) throws AspireException {
    //The ScanListener which maintain the local processQueue and listen for new items to crawl
    Scanner scanner = new Scanner(rap, info);
    
    //Create the crawlRoot item to initialize the crawl from the RAP
    SourceItem crawlRoot = new SourceItem("crawlRoot");
    rap.processCrawlRoot(crawlRoot, info, scanner);
    
    //Crawls the local processQueue while it is not empty
    // when empty, it means the crawl finished
    do {
      scanner.processQueue(processor);
    } while (!scanner.isQueueEmpty());
  }


downloadStream() method:

Code Block
languagejava
firstline93
titledownloadStream method
linenumberstrue
collapsetrue
  private static void downloadStream(SourceItem item) {
    System.out.println("Item: "+item.getName());
    try {
      InputStream is = item.getContentStream();
      if (is != null) {
        FileOutputStream fos = new FileOutputStream(new File("output/"+item.getName()));
        copyStream(is, fos);
        fos.close();
        is.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }



For Legacy connectors standalone crawls see:

...