Page History
...
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
/**
 * Saves the content stream of the given item to {@code output/<item name>}.
 *
 * <p>The item name is printed to standard output first. Items whose content
 * stream is {@code null} are skipped. Any exception is reported with a printed
 * stack trace and is not propagated to the caller.
 *
 * @param item the item whose content stream should be written to disk
 */
private static void downloadStream(SourceItem item) {
    System.out.println("Item: " + item.getName());
    // try-with-resources guarantees both streams are closed even when the copy
    // (or opening the output file) fails; a null resource is allowed and is
    // simply not closed.
    try (InputStream is = item.getContentStream()) {
        if (is != null) {
            try (FileOutputStream fos = new FileOutputStream(new File("output/" + item.getName()))) {
                copyStream(is, fos);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
} |
Step 4
...
The Scanner class:
This is where the crawl gets done. When the processQueue() method is called, the following happens:
- The current items get moved to another queue to be iterated
- For each item in the queue
- It checks if the item is a container
- It calls the populate method on the items that need to be processed
- It calls the fetcher to process the item
- If the item was a container, call the scan() method with it (this adds more items into the queue)
Code Block | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||||
Code Block | ||||||||||||
| ||||||||||||
static class Scanner implements ScanListener {
    /**
     * Receives every newly discovered item that still has to be processed.
     */
    ArrayList<SourceItem> queue;
    /**
     * Working copy of {@link #queue} that is iterated while new items keep
     * arriving in the original queue.
     */
    ArrayList<SourceItem> safeQueue;
    private RepositoryAccessProvider rap;
    private SourceInfo info;
    private FetchURL fetcher;

    /**
     * Creates a scanner with empty queues and an initialized fetcher.
     *
     * @param rap  the repository access provider used to inspect, populate and scan items
     * @param info the source information passed to the RAP and to every fetch job
     * @throws AspireException declared by the original signature for initialization failures
     */
    public Scanner(RepositoryAccessProvider rap, SourceInfo info) throws AspireException {
        this.rap = rap;
        this.info = info;
        this.queue = new ArrayList<>();
        this.safeQueue = new ArrayList<>();
        this.fetcher = new StandaloneFetchURL(rap);
        this.fetcher.initialize(emptyDom);
    }

    /**
     * Closes the underlying fetcher.
     */
    public void close() {
        fetcher.close();
    }

    /**
     * Callback for a single item reported by the RAP scanner; the item is queued.
     */
    @Override
    public void addItem(SourceItem item) {
        queue.add(item);
    }

    /**
     * Callback for a batch of items reported by the RAP scanner; all are queued.
     */
    @Override
    public void addItems(List<SourceItem> items) {
        queue.addAll(items);
    }

    /**
     * Processes everything currently queued.
     *
     * <p>The pending items are moved into {@link #safeQueue} first. Each item is
     * then populated and fetched (unless it is a container and containers are not
     * indexed), scanned if it is a container, and finally handed to
     * {@code processor}.
     *
     * @param processor receives each item after it has been handled
     * @throws AspireException declared by the original signature for failures in the RAP or fetcher calls
     */
    public void processQueue(Consumer<SourceItem> processor) throws AspireException {
        RepositoryConnection conn = rap.newConnection(info);

        // Snapshot the pending items and empty the original queue, so that
        // scan() can append to `queue` while `safeQueue` is being iterated.
        safeQueue.clear();
        safeQueue.addAll(queue);
        queue.clear();

        for (SourceItem current : safeQueue) {
            boolean isContainer = rap.isContainer(current, conn);

            if (info.indexContainers() || !isContainer) {
                fetchContent(current, conn);
            }
            if (isContainer) {
                // Discovered children are reported back through addItem/addItems.
                rap.scan(current, info, conn, this);
            }
            processor.accept(current);
        }

        safeQueue.clear();
    }

    /**
     * Populates the item and runs it through the fetcher, then stores the
     * job's "contentStream" value on the item.
     */
    private void fetchContent(SourceItem item, RepositoryConnection conn) throws AspireException {
        rap.populate(item, info, conn);

        Job job = JobFactory.newInstance(item.generateJobDocument());
        job.put("sourceInfo", info);
        job.put("crawlController", info.getCrawlController());
        fetcher.process(job);

        item.setContentStream((InputStream) job.get("contentStream"));
    }

    /**
     * @return {@code true} when neither the queue nor its working copy holds any item
     */
    public boolean isQueueEmpty() {
        return queue.isEmpty() && safeQueue.isEmpty();
    }
}
|
Step 5
...
Fetcher class
This class is only needed to override the getComponent method so that it returns the correct RAP object.
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
/**
 * FetchURL variant that resolves the {@code "rap"} component path to the
 * RAP instance supplied at construction time.
 */
private static class StandaloneFetchURL extends FetchURL {
    /** Component path under which the RAP is looked up. */
    public static final String RAP = "rap";

    RepositoryAccessProvider rap;

    /**
     * @param rap the repository access provider to hand out from {@link #getComponent(String)}
     */
    public StandaloneFetchURL(RepositoryAccessProvider rap) {
        this.rap = rap;
    }

    /**
     * @param path the requested component path
     * @return the RAP (cast to {@code Component}) when {@code path} equals {@link #RAP},
     *         otherwise {@code null}
     */
    @Override
    public Component getComponent(String path) {
        return RAP.equals(path) ? (Component) rap : null;
    }
} |
StandaloneCrawlController class:
This class exists only to override the getNoSQLConnection method, which some components need, so that it returns a dummy NoSQLConnection object.
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
/**
 * Crawl controller for standalone use: it hands out the RAP given at
 * construction time and a no-op NoSQL connection.
 */
private static class StandaloneCrawlController extends CrawlControllerImpl {

    RepositoryAccessProvider rap;

    /**
     * @param rap the repository access provider returned by {@link #getRAP()}
     */
    public StandaloneCrawlController(RepositoryAccessProvider rap) {
        this.rap = rap;
    }

    /**
     * @return the RAP supplied to the constructor
     */
    @Override
    public RepositoryAccessProvider getRAP() {
        return rap;
    }

    /**
     * Always returns a fresh {@code EmptyNoSQLConnection}; both arguments are ignored.
     */
    @Override
    public NoSQLConnection getNoSQLConnection(String name, AspireObject properties) {
        return new EmptyNoSQLConnection();
    }
} |
EmptyNoSQLConnection class:
As its name suggests, this is a dummy NoSQLConnection; it doesn't really do anything.
Code Block | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|
| ||||||||||
public class EmptyNoSQLConnection implements NoSQLConnection { @Override public String getDatabase() { return null; } @Override public String getCollection() { return null; } @Override public void add(AspireObject item) throws AspireException { } @Override public void update(AspireObject item, String id) throws AspireException { } @Override public void update(AspireObject item, AspireObject filter) throws AspireException { } @Override public void updateAll(AspireObject item, AspireObject filter) throws AspireException { } @Override public void updateOrAdd(AspireObject item, AspireObject filter) throws AspireException { } @Override public boolean delete(String id) throws AspireException { return false; } @Override public boolean delete(AspireObject filter) throws AspireException { return false; } @Override public boolean deleteAll(AspireObject filter) throws AspireException { return false; } @Override public AspireObject getOneAndUpdate(AspireObject filter, AspireObject update) throws AspireException { return null; } @Override public AspireObject getOne(AspireObject filter) throws AspireException { return null; } @Override public NoSQLIterable<AspireObject> getAll(AspireObject filter) throws AspireException { return null; } @Override public NoSQLIterable<AspireObject> getAll(AspireObject filter, int skip) throws AspireException { return null; } @Override public NoSQLIterable<AspireObject> getAll() throws AspireException static class Scanner implements ScanListener { /** * The queue that receives all the new items discovered */ ArrayList<SourceItem> queue; /** * Temporary queue used for iterate the original queue */ ArrayList<SourceItem> safeQueue; private RepositoryAccessProvider rap; private SourceInfo info; private FetchURL fetcher; public Scanner(RepositoryAccessProvider rap, SourceInfo info) throws AspireException { this.rap = rap; this.info = info; queue = new ArrayList<SourceItem>(); safeQueue = new ArrayList<SourceItem>(); fetcher = new 
StandaloneFetchURL(rap); fetcher.initialize(emptyDom); } public void close() { fetcher.close(); } @Override public void addItem(SourceItem item) { //This gets called when the RAP scanner adds an item to crawl queue.add(item); } @Override public void addItems(List<SourceItem> items) { //This gets calledreturn whennull; the RAP scanner adds items to crawl} queue.addAll(items);@Override } public NoSQLIterable<AspireObject> getAll(int skip) public void processQueue(Consumer<SourceItem> processor) throws AspireException { throws AspireException { RepositoryConnection conn = rap.newConnection(info)return null; } //Move the items from the original queue into the safeQueue@Override public long size() throws AspireException { //And clear thereturn orignal0; safeQueue.clear();} safeQueue.addAll(queue);@Override queue.clear(); public long size(AspireObject filter) throws AspireException { return 0; for (SourceItem item : safeQueue) {} @Override boolean container = false; public void clear() throws AspireException { } if (rap.isContainer(item, conn)) {@Override public void close() throws container = true; AspireException { } @Override public AspireObject if (info.indexContainers() || !container) { getAspireObject(Object obj) throws AspireException { rap.populate(item, info, conn) return null; } //Call the fetcher@Override public Job job = JobFactory.newInstance(item.generateJobDocument());void flush() { } job.put("sourceInfo", info); @Override public void job.put("crawlController", info.getCrawlController());setBulkTimeout(long timeout) { } fetcher.process(job); @Override item.setContentStream((InputStream) job.get("contentStream")); public void setBulkSize(int size) { } @Override public void if (containeruseBulk(boolean useBulk) { } @Override public AspireObject rap.scan(item, info, conn, this); getOneAndUpdateOrAdd(AspireObject update, AspireObject filter) throws AspireException }{ processor.accept(item)return null; } safeQueue.clear(); @Override } public AspireObject 
getOneAndDelete(AspireObject filter) public booleanthrows isQueueEmpty()AspireException { return queue.size()+safeQueue.size() == 0 return null; } } } |
For standalone crawls with Legacy connectors, see:
...
Overview
Content Tools