Commit 83739963420642ad8021b66ab2f68cedaa6a99e6
Get message downloads working
Charles Lehner committed on 9/12/2014, 6:16:32 PM
Parent: 1f4de2eeab8c60b37f065c88bd87aa3ba6c51e0d
Files changed
ggscrape | changed |
ggscrape | ||
---|---|---|
@@ -7,9 +7,9 @@ | ||
7 | 7 … | # Copyright (c) 2014 Charles Lehner |
8 | 8 … | # Released under the terms of the MIT License. |
9 | 9 … | |
10 | 10 … | VERSION=0.0.1 |
11 | -BASE_URL='https://groups.google.com/forum/?_escaped_fragment_=' | |
11 … | +BASE_URL='https://groups.google.com/forum/' | |
12 | 12 … | |
13 | 13 … | EX_USAGE=64 |
14 | 14 … | topic_range=100 |
15 | 15 … | |
@@ -28,24 +28,28 @@ | ||
28 | 28 … | fi |
29 | 29 … | } |
30 | 30 … | |
31 | 31 … | req() { |
32 | - debug_print query: "$1" | |
32 … | + debug_print req: "$1" | |
33 | 33 … | curl -sN -b "$cookie_str" "$BASE_URL$@" |
34 | 34 … | } |
35 | 35 … | |
36 … | +req_fragment() { | |
37 … | + req "?_escaped_fragment_=$@" | |
38 … | +} | |
39 … | + | |
36 | 40 … | test_permission() { |
37 | 41 … | debug_print testing permission |
38 | - req "forum/${group_id}%5B1-1-false%5D" -I | grep -q '200 OK' | |
42 … | + req_fragment "forum/${group_id}%5B1-1-false%5D" -I | grep -q '200 OK' | |
39 | 43 … | } |
40 | 44 … | |
41 | 45 … | get_topics_single() { |
42 | 46 … | local start=$1 |
43 | 47 … | local end=$2 |
44 | 48 … | |
45 | 49 … | debug_print "get topics $group_id [$start-$end]" |
46 | 50 … | |
47 | - req "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \ | |
51 … | + req_fragment "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \ | |
48 | 52 … | "s/^<i>Showing [^<]* of 0 topics<\/i>$//p; |
49 | 53 … | s/<tr>/\0\n/; /lastPostDate/ { |
50 | 54 … | s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D; |
51 | 55 … | }; |
@@ -64,9 +68,9 @@ | ||
64 | 68 … | local start=$(($1)) |
65 | 69 … | local end=$(($2)) |
66 | 70 … | local temp_end |
67 | 71 … | |
68 | - if [[ -z "$start" ]]; then | |
72 … | + if ((start==0)); then | |
69 | 73 … | start=1 |
70 | 74 … | fi |
71 | 75 … | |
72 | 76 … | debug_print "get all topics $group_id [$start-$end]" |
@@ -79,9 +83,9 @@ | ||
79 | 83 … | |
80 | 84 … | # get message ids in a topic |
81 | 85 … | get_messages() { |
82 | 86 … | local topic_id="$1" |
83 | - req "topic/${group_id}/${topic_id}" | sed -n\ | |
87 … | + req_fragment "topic/${group_id}/${topic_id}" | sed -n\ | |
84 | 88 … | 's/.*<td class="subject"><a href="[^"]*'$topic_id'\/\([^"]*\)".*/\1/p' |
85 | 89 … | } |
86 | 90 … | |
87 | 91 … | download_message() { |
@@ -89,11 +93,19 @@ | ||
89 | 93 … | local msg_id="$2" |
90 | 94 … | debug_print download topic $topic_id message $msg_id |
91 | 95 … | path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml" |
92 | 96 … | if [[ -s "$path" ]]; then |
93 | - echo "file for message ${msg_id} already exists. skipping." | |
97 … | + echo "message ${topic_id}${msg_id} already downloaded. skipping." | |
94 | 98 … | else |
95 | - req "${group_id}/${topic_id}/${msg_id}" > "$path" | |
99 … | + echo "message ${topic_id}${msg_id} downloading." | |
100 … | + temp=$(mktemp) | |
101 … | + if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -o "$temp" | |
102 … | + then | |
103 … | + mv "$temp" "$path" | |
104 … | + else | |
105 … | + echo "message ${topic_id}${msg_id} failed to download." >&2 | |
106 … | + rm "$temp" | |
107 … | + fi | |
96 | 108 … | fi |
97 | 109 … | } |
98 | 110 … | |
99 | 111 … | download_messages() { |
Built with git-ssb-web