public class MongoDBKnowUriFilter extends Object implements org.dice_research.squirrel.data.uri.filter.KnownUriFilter, Cloneable, Closeable, org.dice_research.squirrel.deduplication.hashing.UriHashCustodian
Modifier and Type | Field and Description |
---|---|
private com.mongodb.MongoClient |
client |
static String |
COLLECTION_NAME |
static String |
COLUMN_CRAWLING_IN_PROCESS |
static String |
COLUMN_HASH_VALUE |
static String |
COLUMN_IP |
static String |
COLUMN_TIMESTAMP_LAST_CRAWL |
static String |
COLUMN_TIMESTAMP_NEXT_CRAWL |
static String |
COLUMN_TYPE |
static String |
COLUMN_URI |
static String |
DB_NAME |
private static String |
DUMMY_HASH_VALUE
Used as the default hash value for URIs; it is replaced by the real hash
value as soon as that value has been computed.
|
private static org.slf4j.Logger |
LOGGER |
private com.mongodb.client.MongoDatabase |
mongoDB |
private static boolean |
PERSIST |
private Integer |
recrawlEveryWeek |
Constructor and Description |
---|
MongoDBKnowUriFilter(String hostName,
Integer port) |
Modifier and Type | Method and Description |
---|---|
void |
add(org.dice_research.squirrel.data.uri.CrawleableUri uri,
long nextCrawlTimestamp) |
void |
add(org.dice_research.squirrel.data.uri.CrawleableUri uri,
long lastCrawlTimestamp,
long nextCrawlTimestamp) |
void |
addHashValuesForUris(List<org.dice_research.squirrel.data.uri.CrawleableUri> uris) |
void |
close() |
long |
count() |
org.bson.Document |
crawleableUriToMongoDocument(org.dice_research.squirrel.data.uri.CrawleableUri uri) |
List<org.dice_research.squirrel.data.uri.CrawleableUri> |
getOutdatedUris() |
Set<org.dice_research.squirrel.data.uri.CrawleableUri> |
getUrisWithSameHashValues(Set<org.dice_research.squirrel.deduplication.hashing.HashValue> hashValuesForComparison) |
boolean |
isUriGood(org.dice_research.squirrel.data.uri.CrawleableUri uri) |
boolean |
knowUriTableExists() |
void |
open() |
void |
purge() |
private static final org.slf4j.Logger LOGGER
private com.mongodb.MongoClient client
private com.mongodb.client.MongoDatabase mongoDB
public static final String DB_NAME
private Integer recrawlEveryWeek
public static final String COLLECTION_NAME
public static final String COLUMN_TIMESTAMP_LAST_CRAWL
public static final String COLUMN_URI
public static final String COLUMN_CRAWLING_IN_PROCESS
public static final String COLUMN_TIMESTAMP_NEXT_CRAWL
public static final String COLUMN_IP
public static final String COLUMN_TYPE
public static final String COLUMN_HASH_VALUE
private static final boolean PERSIST
private static final String DUMMY_HASH_VALUE
public boolean isUriGood(org.dice_research.squirrel.data.uri.CrawleableUri uri)
isUriGood
in interface org.dice_research.squirrel.data.uri.filter.UriFilter
public void add(org.dice_research.squirrel.data.uri.CrawleableUri uri, long nextCrawlTimestamp)
add
in interface org.dice_research.squirrel.data.uri.filter.KnownUriFilter
public org.bson.Document crawleableUriToMongoDocument(org.dice_research.squirrel.data.uri.CrawleableUri uri)
public void close() throws IOException
close
in interface Closeable
close
in interface AutoCloseable
Throws: IOException
public void open()
open
in interface org.dice_research.squirrel.data.uri.filter.KnownUriFilter
public boolean knowUriTableExists()
public void add(org.dice_research.squirrel.data.uri.CrawleableUri uri, long lastCrawlTimestamp, long nextCrawlTimestamp)
add
in interface org.dice_research.squirrel.data.uri.filter.KnownUriFilter
public void addHashValuesForUris(List<org.dice_research.squirrel.data.uri.CrawleableUri> uris)
addHashValuesForUris
in interface org.dice_research.squirrel.deduplication.hashing.UriHashCustodian
public void purge()
public List<org.dice_research.squirrel.data.uri.CrawleableUri> getOutdatedUris()
getOutdatedUris
in interface org.dice_research.squirrel.data.uri.filter.KnownUriFilter
public long count()
count
in interface org.dice_research.squirrel.data.uri.filter.KnownUriFilter
public Set<org.dice_research.squirrel.data.uri.CrawleableUri> getUrisWithSameHashValues(Set<org.dice_research.squirrel.deduplication.hashing.HashValue> hashValuesForComparison)
getUrisWithSameHashValues
in interface org.dice_research.squirrel.deduplication.hashing.UriHashCustodian
Copyright © 2017–2020. All rights reserved.