diff options
author | 2017-11-17 12:21:49 +0000 | |
---|---|---|
committer | 2017-11-17 12:21:49 +0000 | |
commit | ab44569cb52f2e6aa4d7d325558a20548aa3ec2c (patch) | |
tree | 1846f937dc5add0d9d5f0cef9fbac12f34dbccd6 /doc | |
parent | 745591de00987f104d476e4241304c91c7c932db (diff) |
Diffstat (limited to 'doc')
-rw-r--r-- | doc/forum/Adding_a_URL_from_wayback_machine.mdwn | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/doc/forum/Adding_a_URL_from_wayback_machine.mdwn b/doc/forum/Adding_a_URL_from_wayback_machine.mdwn new file mode 100644 index 000000000..49eeef470 --- /dev/null +++ b/doc/forum/Adding_a_URL_from_wayback_machine.mdwn @@ -0,0 +1,64 @@ +Is there a nice way to add a URL from the Wayback machine? +I don't want to add HTML documents but rather PDFs etc. +The problem is that web.archive.org doesn't send a Content-Length header: + + $ curl -I http://web.archive.org/web/20171117120847/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg + HTTP/1.1 200 OK + Server: Tengine/2.1.0 + Date: Fri, 17 Nov 2017 12:09:35 GMT + Content-Type: audio/ogg + Connection: keep-alive + X-Archive-Orig-content-length: 29784324 + X-Archive-Orig-accept-ranges: bytes + X-Archive-Orig-server: Apache/2.4.29 (Debian) + X-Archive-Orig-last-modified: Wed, 11 Feb 2015 01:31:47 GMT + X-Archive-Orig-connection: close + X-Archive-Orig-etag: "1c67904-50ec5f783257c" + X-Archive-Orig-date: Fri, 17 Nov 2017 12:08:49 GMT + Cache-Control: max-age=1800 + X-Archive-Guessed-Content-Type: audio/ogg + Memento-Datetime: Fri, 17 Nov 2017 12:08:47 GMT + Link: <https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="original", <http://web.archive.org/web/timemap/link/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="timegate", <http://web.archive.org/web/20140426045733/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="first memento"; datetime="Sat, 26 Apr 2014 04:57:33 GMT", <http://web.archive.org/web/20170822174608/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="prev memento"; datetime="Tue, 22 Aug 2017 17:46:08 GMT", <http://web.archive.org/web/20171117120847/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="memento"; datetime="Fri, 17 Nov 2017 12:08:47 GMT", <http://web.archive.org/web/20171117120847/https://downloads.kitenet.net/videos/git-annex/git-annex_views_demo.ogg>; rel="last memento"; datetime="Fri, 17 Nov 2017 12:08:47 GMT" + Content-Security-Policy: default-src 'self' 'unsafe-eval' 'unsafe-inline' data: archive.org web.archive.org analytics.archive.org + X-App-Server: wwwb-app39 + X-ts: ---- + X-Archive-Playback: 0 + X-location: All + X-Page-Cache: MISS + +Therefore, I'd have to pass the `--relaxed` flag when calling `addurl`. +Is there maybe a way to tell git-annex to download the file before checking its size? +Or is there a way to use the Wayback machine via the S3 special remote? + +Eventually, I'd like to have a script that automatically saves all web content in my repo +into the Wayback machine and adds the new URLs for redundancy. + +This is what I've got so far: + + #! /usr/bin/env python3 + + from subprocess import check_output, Popen, PIPE + import json + from urllib import request + from sys import stdout + + output = check_output(['git-annex', 'find', '--in', 'web', '--json']) + files = [json.loads(line)['file'] for line in output.split(b'\n')[:-1]] + p = Popen(['git-annex', 'whereis', '--batch', '--json'], stdin=PIPE, stdout=PIPE) + p.stdin.writelines(str.encode(f + '\n') for f in files) + out, err = p.communicate() + urls = [] + for line in out.split(b'\n')[:-1]: + for loc in json.loads(line)['untrusted']: + if loc['uuid'] == '00000000-0000-0000-0000-000000000001': + url = loc['urls'][0] + import urllib.request + print('GET https://web.archive.org/save/' + url) + r = request.urlopen('https://web.archive.org/save/' + url) + assert(r.getcode() == 200) + urls.append(r.geturl()) + break + assert(len(files) == len(urls)) + p = Popen(['git-annex', 'addurl', '--batch', '--with-files', '--relzed'], stdin=PIPE, stdout=stdout) + p.stdin.writelines(str.encode(f + ' ' + u) for f, u in zip(urls, files)) + p.communicate() |