Files: 2096f35b03799f27ac1e12ded345cae328e4d72c / ggscrape
5833 bytesRaw
1 | |
2 | # vi: ts=4 sw=4 et |
3 | # |
4 | # ggscrape(1) |
5 | # Download emails from a Google Group |
6 | # |
7 | # Copyright (c) 2014 Charles Lehner |
8 | # Released under the terms of the MIT License. |
9 | |
# Program version, printed by --version.
readonly VERSION=1.0.0
# Prefix prepended to every request path.
readonly BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
readonly topic_range=100

# Run-time settings; filled in by the command-line parser further down.
opt_verbose=
group_id=
dest_dir=
ln_dir=
# Cookie string sent with every request. Defaults to $GG_COOKIE when that is
# set and non-empty; the --cookie option overrides it.
cookie_str=${GG_COOKIE:-}
25 | |
# Print a debug message on stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - any non-empty value enables output
# Arguments: the words of the message; they are joined with single spaces
# Outputs:   the message on stderr, or nothing when verbose mode is off
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # "$*" keeps each argument intact: no word-splitting, no globbing,
        # and printf cannot mistake a message such as "-n" for an option.
        printf '%s\n' "$*" >&2
    fi
}
31 | |
# Fetch a path below BASE_URL with curl.
# Globals:   BASE_URL (read), cookie_str (read)
# Arguments: $1  - path (and query string) appended to BASE_URL
#            $2+ - extra options handed to curl unchanged (e.g. -I, -o FILE)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
req() {
    local path=${1-}
    [[ $# -gt 0 ]] && shift
    debug_print req: "$path"
    # The URL is built from the first argument only; the remaining arguments
    # are passed as separate curl options.
    curl -sN -b "$cookie_str" "${BASE_URL}${path}" "$@"
}
36 | |
# Request a page through the "?_escaped_fragment_=" query parameter.
# Arguments: $1  - fragment value placed after "?_escaped_fragment_="
#            $2+ - extra options forwarded to req (and from there to curl)
# Returns:   the exit status of req
req_fragment() {
    local fragment=${1-}
    [[ $# -gt 0 ]] && shift
    req "?_escaped_fragment_=${fragment}" "$@"
}
40 | |
# Check that the group's topic listing can be fetched.
# Globals:   group_id (read)
# Outputs:   'Unable to access group.' on stderr when the check fails
# Returns:   0 when the response headers carry status 200, 1 otherwise
check_permission() {
    debug_print testing permission
    # Match on the status code rather than the text "200 OK": HTTP/2 status
    # lines ("HTTP/2 200") have no reason phrase.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}
49 | |
# List one page of topics, numbered $1 through $2, for the current group.
# Globals:   group_id (read)
# Arguments: $1 - first topic number of the page
#            $2 - last topic number of the page
# Outputs:   one tab-separated record per topic on stdout. Each record is
#            built from three consecutive extracted lines, emitted in the
#            order first, third, second.
#            NOTE(review): presumably topic id, last-post date, title -
#            confirm against the live HTML.
# Returns:   99 when the page reports "of 0 topics"; otherwise the exit
#            status of the final sed.
# Relies on GNU sed extensions (\0, the "m" flag, "q" with an exit code).
get_topics_single() {
    local start=$1
    local end=$2

    debug_print "get topics $group_id [$start-$end]"

    # First sed: turn the "0 topics" notice into an empty line, and extract
    # the lastPostDate text and the href tail plus link text of "subject"
    # rows. Second sed: stop with status 99 on that empty line, otherwise
    # join each group of three lines into one tab-separated record.
    req_fragment "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        "s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }" | sed '/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'
    # return status 99 if no matches found
}
66 | |
# Report an unrecognised command-line word and terminate the script.
# Globals:   EX_USAGE (read) - exit status; 64 is used when it is unset
# Arguments: $1 - the offending word
# Outputs:   "Unknown option <word>" on stderr
# Does not return: exits the shell.
unknown_option() {
    echo "Unknown option $1" >&2
    # Never fall through to a bare "exit", which would report success.
    exit "${EX_USAGE:-64}"
}
71 | |
# List topics numbered $1..$2, fetching them in pages of topic_range.
# Globals:   topic_range (read), group_id (read)
# Arguments: $1 - first topic number; empty or 0 means 1
#            $2 - last topic number; empty or 0 means "until a page fails"
# Outputs:   the output of get_topics_single for every page fetched
# Returns:   1 when check_permission fails
get_topics() {
    local start=$(($1))
    local end=$(($2))
    # The loop counter is local so it cannot clobber a caller's variable.
    local i temp_end

    if ((start == 0)); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the last page to the requested end, if one was given.
        ((temp_end = (end && i + topic_range - 1 > end) ? end : i + topic_range - 1))
        # A failing page (e.g. status 99, no topics left) ends the listing.
        get_topics_single "$i" "$temp_end" || break
    done
}
90 | |
# Print the message ids found on a topic page, one per line.
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   for each "subject" cell whose link contains "<topic_id>/", the
#            part of the href that follows it
# Returns:   the exit status of the extracting sed
get_messages() {
    local topic_id="$1"
    local topic_re
    # Escape regex metacharacters so the id is matched literally and cannot
    # break out of the sed expression.
    topic_re=$(printf '%s' "$topic_id" | sed 's,[][\\/.^$*],\\&,g')
    req_fragment "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_re"'\/\([^"]*\)".*/\1/p'
}
97 | |
# Repair a downloaded raw message in place.
# Arguments: $1 - path of the message file; it is rewritten in place
# Relies on GNU sed (-i with a suffix, \r, "q" with an exit code).
fix_message() {
    local file="$1"
    # Some messages have two duplicate sets of headers, the second set starting
    # with some X-Google stuff. Remove the second set of headers if they are
    # present.
    # The probe succeeds only when the line right after the first CR-only
    # line starts with "X-Google-Groups:". "$q1" makes it fail when no such
    # separator exists (or it is the last line), so a file without CRLF
    # separators is never cut from its X-Google-Groups header to the end.
    if sed -n '/^\r$/{ $q1; n; /^X-Google-Groups:/q; q1; }; $q1' -- "$file"; then
        debug_print 'Removing duplicate headers'
        sed -i~ '/^X-Google-Groups:/,/^\r$/d' -- "$file"
    fi
    # Add missing multipart/alternative boundary
    # (two consecutive boundary lines get a Content-Type header naming that
    # boundary inserted before the second one).
    sed -i~ '/^--[^< ]*$/{ n; /^--[^< ]*$/{ s/^--\(.*\)/Content-Type: multipart\/alternative; boundary=\1\n\n\0/; }; }' -- "$file"
    # Drop the backup copy left behind by "sed -i~".
    rm -f -- "$file~"
}
111 | |
# Download one raw message into dest_dir unless it is already there.
# Globals:   dest_dir, group_id, ln_dir (all read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   a progress line on stdout; a failure notice on stderr
# The message is stored as "<dest_dir>/<group_id><topic_id><msg_id>.eml" and,
# when ln_dir is set, hard-linked into that directory as well.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Keep the working variables out of the global scope.
    local path temp
    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return 0
    fi

    echo "message ${topic_id}${msg_id} downloading."
    temp=$(mktemp) || return 1
    # -f makes curl fail on HTTP error responses, so an error page is never
    # stored as if it were a message.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp"
    then
        fix_message "$temp"
        # Only a complete, repaired file ever appears under its final name.
        mv -- "$temp" "$path"
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
135 | |
# Download every message of one topic.
# Arguments: $1 - topic id
# Outputs:   the progress lines printed by download_message
download_messages() {
    local topic_id="$1"
    local msg_id
    debug_print download topic "$topic_id"
    # Only the first field of each line is the message id. -r keeps
    # backslashes literal. stdin is redirected so nothing run by
    # download_message can swallow the remaining ids.
    get_messages "$topic_id" | while read -r msg_id _; do
        download_message "$topic_id" "$msg_id" </dev/null
    done
}
143 | |
# Download all messages of the topics numbered $1..$2 into dest_dir.
# Globals:   dest_dir (read) - created when missing
# Arguments: $1 - first topic number (passed to get_topics)
#            $2 - last topic number (passed to get_topics)
# Returns:   the exit status of get_topics, so an inaccessible group is
#            reported to the caller; exits 1 when dest_dir cannot be created
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    mkdir -p -- "$dest_dir" || exit 1
    # The first field of each record is the topic id; the rest is ignored.
    get_topics "$start" "$end" | while read -r topic_id _; do
        download_messages "$topic_id" </dev/null
    done
    # Report the producer's status; the loop's status would hide it.
    return "${PIPESTATUS[0]}"
}
153 | |
# Print the program name and version on stdout.
# Globals: VERSION (read)
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}
157 | |
# Print the usage text on stdout.
show_help() {
    # The delimiter is quoted so the text is emitted literally.
    cat << 'EOF'
ggscrape. Download emails from a Google Group

Usage:
  ggscrape <group_id> test
  ggscrape <group_id> topics
  ggscrape <group_id> messages <topic_id>
  ggscrape <group_id> download <dest_dir>
  ggscrape fix_message <file>

Options:
  -h, --help              Show this screen
  --version               Show version
  -v, --verbose           Show debug info
  -c, --cookie <cookie>   Use the given cookie string
  -b, --begin <topicnum>  Topic number at which to begin downloading
  -e, --end <topicnum>    Topic number at which to stop downloading
  -l, --ln <ln_dir>       Hard link email files into this directory

Environmental variables:

  GG_COOKIE               use as value for --cookie

EOF
}
184 | |
185 | cmd= |
186 | topic_id= |
187 | topic_begin= |
188 | topic_end= |
189 | fix_file= |
190 | |
191 | while [[ "$#" -gt 0 ]]; do |
192 | case "$1" in |
193 | -h|--help) show_help; exit;; |
194 | --version) show_version; exit;; |
195 | -v|--verbose) opt_verbose=1;; |
196 | -c|--cookie) cookie_str="$2"; shift;; |
197 | -b|--begin) topic_begin="$2"; shift;; |
198 | -e|--end) topic_end="$2"; shift;; |
199 | -l|--ln) ln_dir="$2"; shift;; |
200 | topics) cmd=topics;; |
201 | test) cmd=test;; |
202 | messages) cmd=messages; topic_id="$2"; shift;; |
203 | download) cmd=download; dest_dir="$2"; shift;; |
204 | fix_message) cmd=fix_message; fix_file="$2"; shift;; |
205 | *) if [[ -z "$group_id" ]]; then |
206 | group_id="$1" |
207 | else |
208 | unknown_option "$1" |
209 | fi;; |
210 | esac |
211 | shift |
212 | done |
213 | |
# fix_message works on a local file and needs no group id, so it is handled
# before the group id check.
if [[ "$cmd" == fix_message ]]; then
    if [[ -z "$fix_file" ]]; then
        show_help
        exit 1
    fi
    fix_message "$fix_file"
    exit
fi

# Every other command operates on a group.
if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

# Run the selected command; its status becomes the script's exit status.
case "$cmd" in
    # A missing command is a usage error, like a missing group id above.
    '') show_help; exit 1 ;;
    test) check_permission && echo Success ;;
    topics) get_topics "$topic_begin" "$topic_end" ;;
    download) download_mails "$topic_begin" "$topic_end" ;;
    messages) get_messages "$topic_id" ;;
    *) echo "Unknown command $cmd" >&2; exit 1 ;;
esac
236 |
Built with git-ssb-web