Hi Tim,
You will want to test your regex with a Java regex tester (see http://www.freeformatter.com/java-regex-tester.html); however, I'm not sure that option will achieve what you want.
Did you want to use the RSS feed as a URL crawl list? If so, you can adapt this script to create a list of start URLs to index. (Just run it as a pre_gather_command.)
/*
Generate a list of start URLs from an XML data file.
Run using /opt/funnelback/tools/groovy/bin/groovy -cp "$SEARCH_HOME/lib/java/all/*" GenerateStartUrlsFromXml.groovy <collection> [search_home]
Collection.cfg settings;
GenerateStartUrlsFromXml.website=www.website.com
GenerateStartUrlsFromXml.file=$SEARCH_HOME/conf/$COLLECTION_NAME/@downloads/records.xml
@author Robert Prib (rprib@funnelback.com)
*/
import groovy.util.XmlParser;
import groovy.util.XmlSlurper;
import groovy.xml.XmlUtil;
import com.funnelback.common.config.*;
/**
 * Builds a start-URL list for a collection from an XML file of RECORD elements.
 *
 * Constructing an instance does all the work: it reads the collection
 * configuration, seeds "collection.cfg.start.urls.generated" with the contents
 * of "collection.cfg.start.urls" (when that file exists) and then appends one
 * URL per XML record that is not matched by an exclude pattern.
 */
public class GenerateStartUrlsFromXml {
    public String SEARCH_HOME = "/opt/funnelback";
    public String WEBSITE_DOMAIN;
    public String COLLECTION;
    public List<String> EXCLUDE_PATTERNS = [];

    /**
     * Generates the start-URL file for the given collection.
     *
     * Does nothing when no collection name is supplied, or when the
     * "GenerateStartUrlsFromXml.website" setting is empty (a warning is
     * printed to stderr in the latter case).
     *
     * @param collection name of the search collection to run on
     * @param searchHome install root; the default "/opt/funnelback" is kept when empty
     */
    GenerateStartUrlsFromXml(String collection, String searchHome) {
        if (collection) this.COLLECTION = collection;
        if (searchHome) this.SEARCH_HOME = searchHome;
        if (!this.COLLECTION) return;

        // Get collection config
        def collectionConfig = new NoOptionsConfig(new File(this.SEARCH_HOME), this.COLLECTION);
        this.EXCLUDE_PATTERNS = parseExcludePatterns(collectionConfig.value("exclude_patterns"));
        this.WEBSITE_DOMAIN = collectionConfig.value("GenerateStartUrlsFromXml.website");
        if (!this.WEBSITE_DOMAIN) {
            System.err.println("GenerateStartUrlsFromXml: setting 'GenerateStartUrlsFromXml.website' is not set for collection '${this.COLLECTION}'; nothing generated.");
            return;
        }

        String confDir = "${this.SEARCH_HOME}/conf/${this.COLLECTION}";
        File fileStartUrls = new File("${confDir}/collection.cfg.start.urls");
        File fileStartUrlsGenerated = new File("${confDir}/collection.cfg.start.urls.generated");

        // setText creates or truncates the file, so a single call both resets it
        // and seeds it with the manually configured start URLs (if any).
        fileStartUrlsGenerated.setText(fileStartUrls.exists() ? fileStartUrls.text + "\n" : "");

        // Location of the XML file: the "GenerateStartUrlsFromXml.file" setting when
        // present, otherwise the conventional @downloads/records.xml of the collection.
        // The placeholders are substituted here in case the config value still
        // contains them literally.
        String configuredFile = collectionConfig.value("GenerateStartUrlsFromXml.file");
        String recordsPath = configuredFile ?
            configuredFile.trim().replace('$SEARCH_HOME', this.SEARCH_HOME).replace('$COLLECTION_NAME', this.COLLECTION) :
            "${confDir}/@downloads/records.xml";

        processFile(new File(recordsPath), fileStartUrlsGenerated);
    }

    /**
     * Splits the comma-separated exclude pattern setting into trimmed,
     * non-empty patterns; a missing setting yields an empty list.
     */
    private static List<String> parseExcludePatterns(String rawPatterns) {
        return (rawPatterns ?: "").tokenize(",")*.trim().findAll { it };
    }

    /**
     * Streams the XML file line by line and hands each complete RECORD element
     * to {@link #parseXmlRecord}, so the whole file never has to be held in memory.
     *
     * A record is considered complete on the line that contains "&lt;/RECORD&gt;".
     * NOTE(review): this assumes at most one record is closed per line - confirm
     * against the real export format.
     *
     * @param fileXmlRecords         XML file containing the records
     * @param fileStartUrlsGenerated file the resulting URLs are appended to
     */
    public void processFile(File fileXmlRecords, File fileStartUrlsGenerated) {
        // No explicit charset on purpose: keeps the reader's default encoding handling.
        fileXmlRecords.withReader { reader ->
            StringBuilder recordBuffer = new StringBuilder();
            reader.eachLine { line ->
                // Drop the XML declaration and the RECORDS wrapper so that each
                // buffered chunk is a stand-alone document with RECORD as root.
                String cleaned = line.replaceAll(/<\?xml[^>]*\?>/, "").replaceAll(/<\/?RECORDS\b[^>]*>/, "");
                // Keep the line break: without it, tokens split across two lines
                // (e.g. a tag name and its attribute) would be fused together.
                recordBuffer.append(cleaned).append('\n');
                if (line.contains("</RECORD>")) {
                    parseXmlRecord(recordBuffer.toString(), fileStartUrlsGenerated);
                    recordBuffer.setLength(0);
                }
            }
        }
    }

    /**
     * Parses one RECORD document and appends its URL to the file unless the
     * URL is empty or excluded.
     *
     * @param xmlText XML text of a single record
     * @param file    file the URL is appended to, one URL per line
     */
    public void parseXmlRecord(String xmlText, File file) {
        def record = new XmlSlurper().parseText(xmlText);
        String url = constructUrl(record);
        if (testExcludeUrl(url)) file.append("${url}\n");
    }

    /**
     * Tests a URL against EXCLUDE_PATTERNS using plain substring matching.
     *
     * @param url URL to test
     * @return true when the URL is non-empty and contains none of the patterns;
     *         false when it is empty or contains at least one pattern
     */
    public boolean testExcludeUrl(url) {
        if (!url) return false;
        String candidate = url.toString();
        // Blank patterns are skipped: contains("") is always true and would
        // otherwise exclude every URL.
        return !this.EXCLUDE_PATTERNS.any { pattern ->
            pattern && candidate.contains(pattern.toString())
        };
    }

    /**
     * Builds the URL of a record from its PROP children.
     *
     * A non-empty PRODUCT_URL wins and yields "WEBSITE_DOMAIN/PRODUCT_URL".
     * Otherwise the URL is "WEBSITE_DOMAIN/cs/RELATIVE_PATH/FILE_LEAF_REF", with
     * the file name URL-encoded and spaces in the path replaced by "%20".
     *
     * @param record parsed RECORD node
     * @return the URL, or "" when neither form can be built
     */
    public String constructUrl(record) {
        String productPath = propValue(record, "PRODUCT_URL");
        if (productPath) return "${WEBSITE_DOMAIN}/${productPath}";

        String fileName = propValue(record, "FILE_LEAF_REF");
        String path = propValue(record, "RELATIVE_PATH");
        if (!fileName || !path) return "";

        // URL encode the file name as some of them contain spaces; URLEncoder
        // emits "+" for a space, which is rewritten to "%20".
        String encodedFileName = java.net.URLEncoder.encode(fileName, "UTF-8").replaceAll("\\+", "%20");
        String encodedPath = path.replaceAll(" ", "%20");
        return "${WEBSITE_DOMAIN}/cs/${encodedPath}/${encodedFileName}";
    }

    /**
     * Returns the trimmed text of the first PROP child whose NAME attribute
     * equals the given name, or "" when there is no such child.
     */
    private static String propValue(record, String name) {
        def match = record.PROP.find { node -> node.@NAME == name };
        return match.text().trim();
    }

    /**
     * Command line entry point.
     *
     * @param args args[0] = collection name, args[1] = optional search home
     */
    static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: GenerateStartUrlsFromXml <collection> [search_home]");
        }
        String collection = args.length >= 1 ? args[0] : null;
        String searchHome = args.length >= 2 ? args[1] : null;
        new GenerateStartUrlsFromXml(collection, searchHome);
    }
}