Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Old Hbase and Kafka Classes #313

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f5a49e7
Remove Hbase support.
BitBaron Feb 13, 2020
9754566
Remove Kafka support.
BitBaron Feb 13, 2020
ea374b3
Merge pull request #317 from internetarchive/fixes-extractor-multiple…
jkafader Apr 6, 2020
4999843
best medium-ish size
galgeek Apr 3, 2020
0f8fbaf
Merge pull request #327 from internetarchive/yt-dl-format-medium-ish
BitBaron Apr 20, 2020
633a7cf
no youtube-dl cache dir
galgeek Apr 28, 2020
aabd16b
don't youtube-dl receivedFromAMQP
galgeek Apr 23, 2020
0a5d46d
Merge pull request #330 from internetarchive/less_youtubedl
jkafader Apr 30, 2020
ce409c0
Merge pull request #329 from internetarchive/ydl-no-cache
jkafader Apr 30, 2020
4763a8a
Merge pull request #334 from internetarchive/fixes-ftp-response-recor…
ato Jun 1, 2020
947507b
Revert "Warc convention for storing ftp responses has been to use a W…
ato Jun 1, 2020
812e96d
Merge pull request #336 from internetarchive/revert-334-fixes-ftp-res…
ato Jun 1, 2020
1b8e7f7
youtube-dl --no-playlist
galgeek Jul 29, 2020
00d1c46
Ensure Replay Input Stream and File Channels are closed after writing
adam-miller Jul 30, 2020
52a8f34
Fixing up logging and comments
adam-miller Jul 30, 2020
a9c0c65
Manage youtube dl temp files which can be closed in the warc writer.
adam-miller Jul 31, 2020
34bb996
Merge pull request #346 from internetarchive/fixes-leaky-file-handles
jkafader Aug 3, 2020
f879226
Merge pull request #342 from internetarchive/noplaylist-ydl
adam-miller Aug 3, 2020
ddf8b68
Merge remote-tracking branch 'upstream/master-ait-contrib' into maste…
BitBaron Aug 6, 2020
38f627b
Revert "Remove Hbase support."
BitBaron Aug 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.frontier.AMQPUrlReceiver;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
Expand Down Expand Up @@ -117,17 +118,52 @@ public class ExtractorYoutubeDL extends Extractor
// unnamed toethread-local temporary file
// Per-thread slot holding the RandomAccessFile used as the youtube-dl temp file.
// NOTE(review): this diff view interleaves what looks like the pre-change
// initialValue() body (create temp file, open it "rw", delete the path, return
// the handle) with the post-change `return null;`. Presumably only one of the
// two belongs in the final file -- confirm against the merged source.
protected transient ThreadLocal<RandomAccessFile> tempfile = new ThreadLocal<RandomAccessFile>() {
protected RandomAccessFile initialValue() {
File t;
try {
t = File.createTempFile("ydl", ".json");
RandomAccessFile f = new RandomAccessFile(t, "rw");
// the path is deleted right after opening; only the open handle is returned
t.delete();
return f;
} catch (IOException e) {
throw new RuntimeException(e);
}
return null;
}
};
/**
 * Closes the calling thread's youtube-dl temp file if one is currently open.
 *
 * <p>Does nothing when the thread-local slot is empty or holds a handle that
 * is no longer open. Otherwise the handle is closed and the slot is cleared,
 * so the next {@link #getLocalTempFile()} call opens a fresh file. Failures
 * while closing are logged at WARNING level (with the stack trace) and are
 * not propagated; the slot is cleared in that case too, so a broken handle
 * is never handed out again.
 */
protected void closeLocalTempFile() {
    RandomAccessFile localTemp = tempfile.get();
    if (localTemp == null || !isOpen(localTemp)) {
        return; // avoid making a new temp file just to close it immediately
    }
    try {
        // Close the handle we already validated rather than re-fetching it
        // through getLocalTempFile(), which would re-check and log again.
        localTemp.close();
    } catch (IOException e) {
        // Pass the throwable as well so the stack trace is not lost.
        logger.log(Level.WARNING, "problem closing ydl temp file " + e, e);
    } finally {
        // Always drop the reference, even when close() failed.
        tempfile.set(null);
    }
}
/**
 * Returns the calling thread's youtube-dl temp file, opening a new one first
 * when the thread-local slot is empty or its handle is no longer open.
 *
 * @return an open temp file handle stored in the thread-local slot
 */
protected RandomAccessFile getLocalTempFile() {
    RandomAccessFile current = tempfile.get();
    boolean usable = current != null && isOpen(current);
    if (!usable) {
        // nothing usable cached for this thread: create and remember a new file
        current = openNewTempFile();
        tempfile.set(current);
    }
    logger.info("Getting youtube-dl temp file ");
    return current;
}
/**
 * Reports whether the given file handle still appears to be open.
 *
 * <p>The check probes the handle with {@code length()}; an
 * {@link IOException} from that call is taken to mean the file is not open
 * and is logged at INFO level.
 *
 * @param f the handle to probe; must not be {@code null}
 * @return {@code true} if the probe succeeded, {@code false} if it threw
 */
protected boolean isOpen(RandomAccessFile f) {
    boolean open;
    try {
        f.length(); // result is irrelevant; only success or failure matters
        open = true;
    } catch (IOException e) {
        logger.info("youtube-dl temp file is not open");
        open = false;
    }
    return open;
}
/**
 * Creates a new temp file named {@code ydl*.json}, opens it read/write and
 * then deletes its path, returning only the open handle.
 *
 * <p>The path is deleted whether or not opening succeeds, so a failed open
 * no longer leaves a stray temp file behind. A failed delete is logged at
 * WARNING level instead of being silently ignored.
 *
 * @return the open read/write handle to the (already unlinked) temp file
 * @throws RuntimeException wrapping the {@link IOException} if the temp file
 *         cannot be created or opened
 */
protected RandomAccessFile openNewTempFile() {
    logger.info("Opening New youtube-dl temp file ");
    File t = null;
    try {
        t = File.createTempFile("ydl", ".json");
        return new RandomAccessFile(t, "rw");
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        // Runs after the handle is opened (same order as before: open, then
        // delete) and also on failure. The open handle is expected to stay
        // usable after the path is removed (POSIX unlink semantics).
        if (t != null && !t.delete()) {
            logger.warning("could not delete ydl temp file " + t);
        }
    }
}

protected CrawlerLoggerModule crawlerLoggerModule;
public CrawlerLoggerModule getCrawlerLoggerModule() {
Expand Down Expand Up @@ -419,7 +455,8 @@ protected YoutubeDLResults runYoutubeDL(CrawlURI uri) {
* https://github.com/ytdl-org/youtube-dl/blob/master/README.md#format-selection
*/
ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config",
"--simulate", "--dump-single-json", "--format=best",
"--simulate", "--dump-single-json", "--format=best[height <=? 576]",
"--no-cache-dir", "--no-playlist",
"--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString());
logger.info("running: " + String.join(" ", pb.command()));

Expand All @@ -446,7 +483,7 @@ public String call() throws IOException {
}
});

YoutubeDLResults results = new YoutubeDLResults(tempfile.get());
YoutubeDLResults results = new YoutubeDLResults(getLocalTempFile());

try {
try {
Expand Down Expand Up @@ -507,6 +544,11 @@ protected boolean shouldExtract(CrawlURI uri) {
return false;
}

// skip crawl uris received from umbra
if (uri.getAnnotations().contains(AMQPUrlReceiver.A_RECEIVED_FROM_AMQP)) {
return false;
}

String mime = uri.getContentType().toLowerCase();
if (mime.startsWith("text/html")
|| mime.startsWith("application/xhtml")
Expand All @@ -524,7 +566,14 @@ public boolean shouldBuildRecord(CrawlURI uri) {
// should build record for containing page, which has an
// annotation like "youtube-dl:3" (no slash)
String annotation = findYdlAnnotation(uri);
return annotation != null && !annotation.contains("/");
boolean shouldBuild = (annotation != null && !annotation.contains("/"));

// If we processed this uri, then we have an open temp file that won't get closed
// for us by the warc writer
if(!shouldBuild)
closeLocalTempFile();

return shouldBuild;
}

@Override
Expand All @@ -545,10 +594,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
recordInfo.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8");
recordInfo.setEnforceLength(true);

tempfile.get().seek(0);
InputStream inputStream = Channels.newInputStream(tempfile.get().getChannel());
getLocalTempFile().seek(0);
InputStream inputStream = Channels.newInputStream(getLocalTempFile().getChannel());
recordInfo.setContentStream(inputStream);
recordInfo.setContentLength(tempfile.get().length());
recordInfo.setContentLength(getLocalTempFile().length());

logger.info("built record timestamp=" + timestamp + " url=" + recordInfo.getUrl());

Expand All @@ -574,7 +623,7 @@ public static void main(String[] args) throws IOException {
ExtractorYoutubeDL e = new ExtractorYoutubeDL();

FileInputStream in = new FileInputStream("/tmp/ydl-single-video.json");
YoutubeDLResults results = new YoutubeDLResults(e.tempfile.get());
YoutubeDLResults results = new YoutubeDLResults(e.getLocalTempFile());
e.streamYdlOutput(in, results);
System.out.println("video urls: " + results.videoUrls);
System.out.println("page urls: " + results.pageUrls);
Expand All @@ -590,7 +639,7 @@ public static void main(String[] args) throws IOException {
}

in = new FileInputStream("/tmp/ydl-uncgreensboro-limited.json");
results = new YoutubeDLResults(e.tempfile.get());
results = new YoutubeDLResults(e.getLocalTempFile());
e.streamYdlOutput(in, results);
System.out.println("video urls: " + results.videoUrls);
System.out.println("page urls: " + results.pageUrls);
Expand Down
Loading