Hi Tim,
You will want to test your regex with a Java regex tester (see http://www.freeformatter.com/java-regex-tester.html); however, I'm not sure that option will achieve what you want.
Did you want to use the RSS feed as a URL crawl list? If so, you can adapt this script to create a list of start URLs to index (just run it as a pre_gather_command).
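/**
 * Generate a list of start URLs from an XML data file.
 *
 * Run using /opt/funnelback/tools/groovy/bin/groovy -cp "$SEARCH_HOME/lib/java/all/*" GenerateStartUrlsFromXml.groovy
 * Pass the collection name as the first argument and, optionally, the Funnelback install directory as the second.
 *
 * Collection.cfg settings:
 *   GenerateStartUrlsFromXml.website - site prefix prepended to every generated URL
 *   exclude_patterns                 - comma-separated substrings; URLs containing any of them are skipped
 *
 * @author Robert Prib (rprib@funnelback.com)
 */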
import groovy.util.XmlParser;
import groovy.util.XmlSlurper;
import groovy.xml.XmlUtil;
import com.funnelback.common.config.*;
/**
 * Builds the file {@code collection.cfg.start.urls.generated} for a collection.
 *
 * The generated file starts with the contents of {@code collection.cfg.start.urls}
 * (when that file exists) and is followed by one URL per {@code <RECORD>} found in
 * {@code conf/<collection>/@downloads/records.xml}. URLs containing any of the
 * collection's {@code exclude_patterns} are left out.
 *
 * All of the work is triggered from the constructor.
 */
public class GenerateStartUrlsFromXml {
    // NOTE(review): the predicate of the original "path" lookup was missing from the
    // script as posted. "FILE_DIR_REF" is assumed by analogy with "FILE_LEAF_REF" --
    // confirm against the NAME attributes actually present in records.xml.
    private static final String PATH_PROP_NAME = "FILE_DIR_REF";

    public String SEARCH_HOME = "/opt/funnelback";
    public String WEBSITE_DOMAIN;
    public String COLLECTION;
    public List<String> EXCLUDE_PATTERNS = [];

    /**
     * Reads the collection configuration and writes the generated start URLs file.
     *
     * Returns without doing anything when no collection name is given or when the
     * {@code GenerateStartUrlsFromXml.website} setting is empty.
     *
     * @param collection name of the search collection to run on
     * @param searchHome Funnelback install directory; the default
     *                   {@code /opt/funnelback} is kept when this is null or empty
     */
    GenerateStartUrlsFromXml(String collection, String searchHome) {
        if (collection) this.COLLECTION = collection;
        if (searchHome) this.SEARCH_HOME = searchHome;
        if (!this.COLLECTION) return;

        // Get collection config
        def collectionConfig = new NoOptionsConfig(new File(this.SEARCH_HOME), this.COLLECTION);

        // exclude_patterns may be unset; treat that as "no patterns" instead of failing.
        // Blank entries are dropped because every URL contains the empty string.
        String excludePatterns = collectionConfig.value("exclude_patterns");
        this.EXCLUDE_PATTERNS = excludePatterns ?
                excludePatterns.tokenize(",").collect { it.trim() }.findAll { it } :
                [];

        this.WEBSITE_DOMAIN = collectionConfig.value("GenerateStartUrlsFromXml.website");
        if (!this.WEBSITE_DOMAIN) return;

        File fileStartUrls = new File("${this.SEARCH_HOME}/conf/${this.COLLECTION}/collection.cfg.start.urls");
        File fileStartUrlsGenerated = new File("${this.SEARCH_HOME}/conf/${this.COLLECTION}/collection.cfg.start.urls.generated");

        // Always rewrite the generated file (setText creates it when missing) so URLs
        // from a previous run never survive, then seed it with the configured start URLs.
        fileStartUrlsGenerated.setText(fileStartUrls.exists() ? fileStartUrls.text + "\n" : "");

        // Process file <-- enter the location of your file to process
        processFile(new File("${this.SEARCH_HOME}/conf/${this.COLLECTION}/@downloads/records.xml"), fileStartUrlsGenerated);
    }

    /**
     * Streams the XML data file and handles one record at a time.
     *
     * Lines are accumulated (with the XML declaration and the {@code <RECORDS>}
     * wrapper tags removed) until a line containing {@code </RECORD>} is seen; the
     * accumulated text is then passed to {@link #parseXmlRecord}. Lines are joined
     * without a separator, as in the original script.
     *
     * @param fileXmlRecords         data file to process for records
     * @param fileStartUrlsGenerated file the resulting URLs are appended to
     */
    public void processFile(File fileXmlRecords, File fileStartUrlsGenerated) {
        // Read through the XML file as a stream, as the file is too large to fit into memory.
        fileXmlRecords.withReader { reader ->
            StringBuilder recordText = new StringBuilder();
            reader.eachLine { line ->
                recordText.append(
                        line.replaceAll("<\\?xml version='1.0' encoding='UTF-8'\\?>", "").replaceAll(/<\/*RECORDS>/, ""));
                if (line.contains("</RECORD>")) {
                    // Pass a single record at a time to XmlSlurper to stay memory efficient.
                    parseXmlRecord(recordText.toString(), fileStartUrlsGenerated);
                    recordText.setLength(0);
                }
            }
        }
    }

    /**
     * Parses one XML record and appends its URL to the given file, unless the URL
     * is empty or excluded.
     *
     * @param xmlText XML text of a single record
     * @param file    file the URL is appended to, followed by a newline
     */
    public void parseXmlRecord(String xmlText, File file) {
        def record = new XmlSlurper().parseText(xmlText);
        def url = constructUrl(record);
        if (testExcludeUrl(url)) file.append("${url}\n");
    }

    /**
     * Tests a URL against the exclude patterns read from collection.cfg
     * ({@code exclude_patterns=...}).
     *
     * @param url URL to test
     * @return false if the URL is null/empty or contains any exclude pattern,
     *         true otherwise
     */
    public boolean testExcludeUrl(url) {
        if (!url) return false;
        // "any" stops at the first matching pattern.
        return !(this.EXCLUDE_PATTERNS?.any { pattern -> url.contains(pattern) });
    }

    /**
     * Constructs the URL of a record from its {@code PROP} elements.
     *
     * A non-empty {@code PRODUCT_URL} property wins and yields
     * {@code <website>/<productUrl>}. Otherwise the URL is
     * {@code <website>/cs/<path>/<fileName>}, or the empty string when either part
     * is missing.
     *
     * @param record (GPathResult) XML node to process
     * @return the constructed URL, or "" when it cannot be built
     */
    public String constructUrl(record) {
        String productPath = propValue(record, "PRODUCT_URL");
        if (productPath) return "${WEBSITE_DOMAIN}/${productPath}";

        String fileName = propValue(record, "FILE_LEAF_REF");
        String path = propValue(record, PATH_PROP_NAME);
        if (!fileName || !path) return "";

        // URL-encode the file name as some of them have spaces in them;
        // URLEncoder emits "+" for a space, which is rewritten to "%20".
        fileName = java.net.URLEncoder.encode(fileName, "UTF-8").replaceAll("\\+", "%20");
        path = path.replaceAll(" ", "%20");
        return "${WEBSITE_DOMAIN}/cs/${path}/${fileName}";
    }

    /** Returns the text of the first PROP child whose NAME attribute equals {@code name}, or "" if none matches. */
    private String propValue(record, String name) {
        return record.PROP.find { node -> node.@NAME == name }.text();
    }

    /**
     * Command-line entry point.
     *
     * @param args args[0] is the collection name; args[1] (optional) is the
     *             Funnelback install directory
     */
    static void main(String[] args) {
        String collection = args.length >= 1 ? args[0] : null;
        String searchHome = args.length >= 2 ? args[1] : null;
        if (!collection) {
            System.err.println("Usage: GenerateStartUrlsFromXml.groovy <collection> [search_home]");
            return;
        }
        new GenerateStartUrlsFromXml(collection, searchHome);
    }
}