Fix error log reporting of batch size for trough crawl logs #367

Merged 2 commits on Feb 26, 2021

Changes from all commits
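The log-message fix in this PR is column arithmetic: each crawled_url row contributes 16 bind values to the flattened parameter array (and each uncrawled_url row contributes 7), so logging flattenedValues.length over-reports the batch size by that factor. A minimal sketch of the idea; the class and constant names below are illustrative, not part of the patch:

// Illustrative sketch only: the reported batch size should divide the flattened
// parameter array length by the column count of the target table.
public class BatchSizeExample {
    // crawled_url takes 16 values per row, per the INSERT template in the diff below
    static final int CRAWLED_URL_COLUMNS = 16;

    public static void main(String[] args) {
        int batchSize = 400; // e.g. a full batch at BATCH_MAX_SIZE
        Object[] flattenedValues = new Object[CRAWLED_URL_COLUMNS * batchSize];

        // what the old warning reported: the flattened length, sixteen times too big
        System.out.println("flattenedValues.length = " + flattenedValues.length); // 6400

        // what the fixed warning reports: the actual number of crawled urls
        System.out.println("urls in batch = " + flattenedValues.length / CRAWLED_URL_COLUMNS); // 400
    }
}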
@@ -89,7 +89,7 @@ public class TroughCrawlLogFeed extends Processor implements Lifecycle {
 
     protected static final Logger logger = Logger.getLogger(TroughCrawlLogFeed.class.getName());
 
-    protected static final int BATCH_MAX_TIME_MS = 20 * 1000;
+    protected static final int BATCH_MAX_TIME_MS = 60 * 1000;
     protected static final int BATCH_MAX_SIZE = 400;
 
     protected KeyedProperties kp = new KeyedProperties();
@@ -252,67 +252,75 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
     }
 
     protected void postCrawledBatch() {
-        logger.info("posting batch of " + crawledBatch.size() + " crawled urls trough segment " + getSegmentId());
-        Object[] flattenedValues = null;
-        StringBuffer sqlTmpl = new StringBuffer();
-        synchronized (crawledBatch) {
-            if (uncrawledBatch.size() >= BATCH_MAX_SIZE || System.currentTimeMillis() - uncrawledBatchLastTime > BATCH_MAX_TIME_MS) {
-                crawledBatchLastTime = System.currentTimeMillis();
-                if (!crawledBatch.isEmpty()) {
-                    sqlTmpl.append("insert into crawled_url ("
-                            + "timestamp, status_code, size, payload_size, url, hop_path, is_seed_redirect, "
-                            + "via, mimetype, content_digest, seed, is_duplicate, warc_filename, "
-                            + "warc_offset, warc_content_bytes, host) values "
-                            + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
-                    for (int i = 1; i < crawledBatch.size(); i++) {
-                        sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
-                    }
+        Object[] flattenedValues = null;
+        StringBuffer sqlTmpl = new StringBuffer();
+        synchronized (crawledBatch) {
+            if (uncrawledBatch.size() >= BATCH_MAX_SIZE || System.currentTimeMillis() - uncrawledBatchLastTime > BATCH_MAX_TIME_MS) {
+                crawledBatchLastTime = System.currentTimeMillis();
+                if (!crawledBatch.isEmpty()) {
+                    logger.info("posting batch of " + crawledBatch.size() + " crawled urls trough segment " + getSegmentId());
+                    sqlTmpl.append("insert into crawled_url ("
+                            + "timestamp, status_code, size, payload_size, url, hop_path, is_seed_redirect, "
+                            + "via, mimetype, content_digest, seed, is_duplicate, warc_filename, "
+                            + "warc_offset, warc_content_bytes, host) values "
+                            + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
+                    for (int i = 1; i < crawledBatch.size(); i++) {
+                        sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)");
+                    }
 
-                    flattenedValues = new Object[16 * crawledBatch.size()];
-                    for (int i = 0; i < crawledBatch.size(); i++) {
-                        System.arraycopy(crawledBatch.get(i), 0, flattenedValues, 16 * i, 16);
-                    }
-                    crawledBatch.clear();
-                }
-            }
-        }
-        if(flattenedValues !=null && flattenedValues.length > 0) {
-            try {
-                troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
-            } catch (Exception e) {
-                logger.log(Level.WARNING, "problem posting batch of " + flattenedValues.length + " crawled urls to trough segment " + getSegmentId(), e);
-            }
-
-            crawledBatchLastTime = System.currentTimeMillis();
-        }
+                    flattenedValues = new Object[16 * crawledBatch.size()];
+                    for (int i = 0; i < crawledBatch.size(); i++) {
+                        System.arraycopy(crawledBatch.get(i), 0, flattenedValues, 16 * i, 16);
+                    }
+                    crawledBatch.clear();
+                }
+            }
+        }
+        if(flattenedValues !=null && flattenedValues.length > 0) {
+            try {
+                synchronized (getSegmentId()) { //avoids 500 due to locked db from posting from uncrawled batch
+                    troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
+                }
+            } catch (Exception e) {
+                logger.log(Level.WARNING, "problem posting batch of " + (flattenedValues.length/16) + " crawled urls to trough segment " + getSegmentId(), e);
+            }
+            crawledBatchLastTime = System.currentTimeMillis();
+        }
     }
     protected void postUncrawledBatch() {
-        logger.info("posting batch of " + uncrawledBatch.size() + " uncrawled urls trough segment " + getSegmentId());
+        Object[] flattenedValues = null;
+        StringBuffer sqlTmpl = new StringBuffer();
         synchronized (uncrawledBatch) {
-            if (!uncrawledBatch.isEmpty()) {
-                StringBuffer sqlTmpl = new StringBuffer();
-                sqlTmpl.append(
-                        "insert into uncrawled_url (timestamp, url, hop_path, status_code, via, seed, host)"
-                                + " values (%s, %s, %s, %s, %s, %s, %s)");
+            if (uncrawledBatch.size() >= BATCH_MAX_SIZE || System.currentTimeMillis() - uncrawledBatchLastTime > BATCH_MAX_TIME_MS) {
+                uncrawledBatchLastTime = System.currentTimeMillis();
+                if (!uncrawledBatch.isEmpty()) {
+                    logger.info("posting batch of " + uncrawledBatch.size() + " uncrawled urls trough segment " + getSegmentId());
+                    sqlTmpl.append(
+                            "insert into uncrawled_url (timestamp, url, hop_path, status_code, via, seed, host)"
+                                    + " values (%s, %s, %s, %s, %s, %s, %s)");
 
-                for (int i = 1; i < uncrawledBatch.size(); i++) {
-                    sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s)");
-                }
+                    for (int i = 1; i < uncrawledBatch.size(); i++) {
+                        sqlTmpl.append(", (%s, %s, %s, %s, %s, %s, %s)");
+                    }
 
-                Object[] flattenedValues = new Object[7 * uncrawledBatch.size()];
-                for (int i = 0; i < uncrawledBatch.size(); i++) {
-                    System.arraycopy(uncrawledBatch.get(i), 0, flattenedValues, 7 * i, 7);
-                }
-
-                try {
-                    troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
-                } catch (Exception e) {
-                    logger.log(Level.WARNING, "problem posting batch of " + uncrawledBatch.size() + " uncrawled urls to trough segment " + getSegmentId(), e);
-                }
-
-                uncrawledBatchLastTime = System.currentTimeMillis();
-                uncrawledBatch.clear();
+                    flattenedValues = new Object[7 * uncrawledBatch.size()];
+                    for (int i = 0; i < uncrawledBatch.size(); i++) {
+                        System.arraycopy(uncrawledBatch.get(i), 0, flattenedValues, 7 * i, 7);
+                    }
+                    uncrawledBatch.clear();
+                }
             }
         }
+        if(flattenedValues !=null && flattenedValues.length > 0) {
+            try {
+                synchronized (getSegmentId()) {
+                    troughClient().write(getSegmentId(), sqlTmpl.toString(), flattenedValues);
+                }
+            } catch (Exception e) {
+                logger.log(Level.WARNING, "problem posting batch of " + uncrawledBatch.size() + " uncrawled urls to trough segment " + getSegmentId(), e);
+            }
+            uncrawledBatchLastTime = System.currentTimeMillis();
+        }
     }
 }
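Both post methods now route troughClient().write() through a synchronized (getSegmentId()) block; per the inline comment in the diff, this keeps the crawled and uncrawled batches from writing to the same Trough segment at once, which previously could surface as an HTTP 500 from a locked database. A minimal sketch of that locking pattern, using an assumed writer class and client interface rather than the Heritrix API:

// Minimal sketch, assuming both batch writers share one lock object so their
// posts to the same segment are serialized. SegmentWriter and Client are
// illustrative names, not Heritrix classes.
public class SegmentWriter {
    public interface Client {
        void write(String segmentId, String sqlTmpl, Object[] values) throws Exception;
    }

    private final Object segmentLock = new Object(); // one monitor shared by all writers of this segment
    private final Client client;
    private final String segmentId;

    public SegmentWriter(Client client, String segmentId) {
        this.client = client;
        this.segmentId = segmentId;
    }

    public void post(String sqlTmpl, Object[] flattenedValues) throws Exception {
        synchronized (segmentLock) {
            // only one batch (crawled or uncrawled) hits the segment's database
            // at a time, avoiding "database is locked" failures
            client.write(segmentId, sqlTmpl, flattenedValues);
        }
    }
}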