Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds bdbmodule max file size option #412

Merged
merged 3 commits into from
Jul 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion commons/src/main/java/org/archive/bdb/BdbModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,14 @@ public boolean getUseSharedCache() {
/**
 * Sets the {@code useSharedCache} flag on this module.
 *
 * @param useSharedCache the new value of the flag
 */
public void setUseSharedCache(boolean useSharedCache) {
this.useSharedCache = useSharedCache;
}

/**
 * Maximum size of a single BDB JE log file. The value is handed to the
 * environment as the {@code je.log.fileMax} configuration parameter
 * during setup. Defaults to 10000000.
 */
protected long maxLogFileSize = 10000000;

/**
 * Returns the configured maximum BDB JE log file size.
 *
 * @return the value that will be used for {@code je.log.fileMax}
 */
public long getMaxLogFileSize() {
    return maxLogFileSize;
}

/**
 * Sets the maximum BDB JE log file size.
 *
 * <p>The parameter was previously named {@code cacheSize} and the body
 * assigned the field to itself, so the supplied value was silently
 * discarded; the argument is now actually stored.
 *
 * @param maxLogFileSize the new value to use for {@code je.log.fileMax}
 */
public void setMaxLogFileSize(long maxLogFileSize) {
    this.maxLogFileSize = maxLogFileSize;
}

/**
* Expected number of concurrent threads; used to tune nLockTables
Expand Down Expand Up @@ -277,7 +285,10 @@ protected void setup(File f, boolean create)
config.setConfigParam("je.lock.nLockTables", Long.toString(nLockTables));

// triple this value to 6K because stats show many faults
config.setConfigParam("je.log.faultReadSize", "6144");
config.setConfigParam("je.log.faultReadSize", "6144");

// Cap the size of each BDB JE log file (je.log.fileMax); the default is 10000000 bytes.
config.setConfigParam("je.log.fileMax", Long.toString(getMaxLogFileSize()));

if(!getUseHardLinkCheckpoints()) {
// to support checkpoints by textual manifest only,
Expand Down
1 change: 1 addition & 0 deletions modules/src/main/java/org/archive/modules/CrawlURI.java
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ public void processingCleanup() {
this.httpRecorder = null;
this.fetchStatus = S_UNATTEMPTED;
this.setPrerequisite(false);
this.clearPrerequisiteUri();
this.contentSize = UNCALCULATED;
this.contentLength = UNCALCULATED;
// Clear 'links extracted' flag.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,7 @@ protected ProcessResult write(CrawlURI curi, long recordLength,
// We just closed the file because it was larger than maxBytes.
// Add to the totalBytesWritten the size of the first record
// in the file, if any.
setTotalBytesWritten(getTotalBytesWritten() +
(writer.getPosition() - position));
addTotalBytesWritten(writer.getPosition() - position);
position = writer.getPosition();
}

Expand All @@ -155,8 +154,7 @@ protected ProcessResult write(CrawlURI curi, long recordLength,
throw e;
} finally {
if (writer != null) {
setTotalBytesWritten(getTotalBytesWritten() +
(writer.getPosition() - position));
addTotalBytesWritten(writer.getPosition() - position);
getPool().returnFile(writer);

String filename = writer.getFile().getName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ protected void updateMetadataAfterWrite(final CrawlURI curi,
+ WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK)
+ " bytes to " + writer.getFile().getName() + " for " + curi);
}
setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - startPosition));
addTotalBytesWritten(writer.getPosition() - startPosition);

curi.addExtraInfo("warcFilename", writer.getFilenameWithoutOccupiedSuffix());
curi.addExtraInfo("warcFileOffset", startPosition);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,7 @@ protected ProcessResult write(final CrawlURI curi)
// We rolled over to a new warc and wrote a warcinfo record.
// Tally stats and reset temp stats, to avoid including warcinfo
// record in stats for current url.
setTotalBytesWritten(getTotalBytesWritten() +
(writer.getPosition() - position));
addTotalBytesWritten(writer.getPosition() - position);
addStats(writer.getTmpStats());
writer.resetTmpStats();
writer.resetTmpRecordLog();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,7 @@ protected ProcessResult write(final String lowerCaseScheme,
// We rolled over to a new warc and wrote a warcinfo record.
// Tally stats and reset temp stats, to avoid including warcinfo
// record in stats for current url.
setTotalBytesWritten(getTotalBytesWritten() +
(writer.getPosition() - position));
addTotalBytesWritten(writer.getPosition() - position);
addStats(writer.getTmpStats());
writer.resetTmpStats();
writer.resetTmpRecordLog();
Expand Down Expand Up @@ -647,6 +646,7 @@ protected URI qualifyRecordID(final URI base, final String key,
protected JSONObject toCheckpointJson() throws JSONException {
    // Start from the parent's checkpoint state, then append this
    // writer's counters: URLs written, total bytes written, and stats.
    final JSONObject checkpoint = super.toCheckpointJson();
    checkpoint.put("urlsWritten", urlsWritten);
    checkpoint.put("totalBytesWritten", getTotalBytesWritten());
    checkpoint.put("stats", stats);
    return checkpoint;
}
Expand All @@ -660,6 +660,9 @@ protected void fromCheckpointJson(JSONObject json) throws JSONException {
if (json.has("urlsWritten")) {
urlsWritten.set(json.getLong("urlsWritten"));
}
if (json.has("totalBytesWritten")) {
setTotalBytesWritten(json.getLong("totalBytesWritten"));
}

if (json.has("stats")) {
HashMap<String, Map<String, Long>> cpStats = new HashMap<String, Map<String, Long>>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Logger;

import org.archive.checkpointing.Checkpoint;
Expand Down Expand Up @@ -273,7 +274,7 @@ public void setStorePaths(List<ConfigPath> paths) {
/**
* Total number of bytes written to disc.
*/
private long totalBytesWritten = 0;
private AtomicLong totalBytesWritten = new AtomicLong();

private AtomicInteger serial = new AtomicInteger();

Expand Down Expand Up @@ -315,7 +316,7 @@ protected ProcessResult checkBytesWritten() {
if (max <= 0) {
return ProcessResult.PROCEED;
}
if (max <= this.totalBytesWritten) {
if (max <= getTotalBytesWritten()) {
return ProcessResult.FINISH; // FIXME: Specify reason
// controller.requestCrawlStop(CrawlStatus.FINISHED_WRITE_LIMIT);
}
Expand Down Expand Up @@ -435,11 +436,14 @@ protected void setPool(WriterPool pool) {
}

protected long getTotalBytesWritten() {
return totalBytesWritten;
return totalBytesWritten.get();
}

protected void setTotalBytesWritten(long totalBytesWritten) {
this.totalBytesWritten = totalBytesWritten;
this.totalBytesWritten.set(totalBytesWritten);
}
/**
 * Atomically increases the running total of bytes written.
 *
 * @param bytesWritten number of bytes to add to the total
 */
protected void addTotalBytesWritten(long bytesWritten) {
    // The returned value is unused, so getAndAdd is equivalent to addAndGet here.
    totalBytesWritten.getAndAdd(bytesWritten);
}

public abstract List<String> getMetadata();
Expand Down