Files: 806ab97eba21eba1b9b85de34dca1fbeb197f602 / ggscrape
5655 bytesRaw
1 | |
2 | # vi: ts=4 sw=4 et |
3 | # |
4 | # ggscrape(1) |
5 | # Download emails from a Google Group |
6 | # |
7 | # Copyright (c) 2014 Charles Lehner |
8 | # Released under the terms of the MIT License. |
9 | |
# Program version (printed by --version) and the endpoint every request is
# built on.
VERSION=1.0.0
BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Settings filled in by the command-line parser; all start out empty.
opt_verbose=
group_id=
dest_dir=
ln_dir=

# Cookie string handed to curl. A non-empty GG_COOKIE environment variable
# provides the initial value; otherwise it starts out empty.
cookie_str=${GG_COOKIE:-}
25 | |
# Print a diagnostic message on stderr when verbose mode is enabled.
# Globals:   opt_verbose (read) - any non-empty value enables output
# Arguments: the words of the message; they are joined with single spaces
# Outputs:   the message followed by a newline, on stderr only
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # printf with a quoted "$*" keeps inner whitespace and glob characters
        # intact and never treats a leading -n/-e as an option, unlike an
        # unquoted `echo $@`.
        printf '%s\n' "$*" >&2
    fi
}
31 | |
# Run curl against a location below BASE_URL.
# Globals:   BASE_URL, cookie_str (read)
# Arguments: $1   - path and/or query string appended to BASE_URL
#            $2.. - extra arguments passed to curl unchanged
# Outputs:   whatever curl writes
# Returns:   curl's exit status
req() {
    local url="${BASE_URL}${1}"

    debug_print req: "$1"
    curl -sN -b "$cookie_str" "$url" "${@:2}"
}
36 | |
# Request a page through the "_escaped_fragment_" query parameter.
# Arguments: $1   - fragment value placed after "?_escaped_fragment_="
#            $2.. - extra arguments handed on to req
# Returns:   the status of req
req_fragment() {
    local query="?_escaped_fragment_=${1}"

    req "$query" "${@:2}"
}
40 | |
# Check that the group's topic listing can be fetched with the current cookie.
# Globals:   group_id (read)
# Outputs:   'Unable to access group.' on stderr when the check fails
# Returns:   0 when the header-only request reports status 200, 1 otherwise
check_permission() {
    debug_print testing permission
    # Match the status line itself rather than the text "200 OK": HTTP/2
    # responses carry no reason phrase ("HTTP/2 200"), and an unanchored
    # search could be satisfied by an unrelated header value.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}
49 | |
# Fetch one page of the group's topic listing and print one line per topic.
# Globals:   group_id (read)
# Arguments: $1 - first topic number of the page
#            $2 - last topic number of the page
# Outputs:   for every three lines produced by the first sed pass, one line
#            "first<TAB>third<TAB>second" -- presumably topic id, last post
#            date and subject; verify against the actual listing markup
# Returns:   99 (GNU sed "q99") when the page reports zero topics, otherwise
#            the exit status of the final sed
get_topics_single() {
    local first=$1
    local last=$2
    local extract join

    # Pass 1: print an empty line for the "0 topics" banner, the text after
    # lastPostDate"> and, for subject links, the last path component of the
    # href followed by the link text on separate lines.
    extract="s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate\">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href=\"[^\"]*\/\([^\"]*\)\"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }"
    # Pass 2: stop with status 99 on an empty line, otherwise fold each
    # group of three lines into a single tab-separated record.
    join='/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'

    debug_print "get topics $group_id [$first-$last]"

    req_fragment "forum/${group_id}%5B${first}-${last}-false%5D" \
        | sed -n "$extract" \
        | sed "$join"
}
66 | |
# Report an unrecognised command-line word and terminate the script.
# Globals:   EX_USAGE (read) - exit status to terminate with
# Arguments: $1 - the offending word
# Outputs:   "Unknown option <word>" on stderr
unknown_option() {
    printf '%s\n' "Unknown option $1" >&2
    exit $EX_USAGE
}
71 | |
# List topics of the group page by page.
# Globals:   group_id, topic_range (read)
# Arguments: $1 - first topic number; empty or 0 means start at 1
#            $2 - last topic number; empty or 0 means keep requesting pages
#                 until get_topics_single reports failure
# Outputs:   the lines printed by get_topics_single for every page
# Returns:   1 when check_permission fails
get_topics() {
    local start=$(($1))
    local end=$(($2))
    local temp_end
    # Keep the loop counter local so it cannot clobber a caller's variable.
    local i

    if ((start == 0)); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the page to the requested end, if an end was given.
        ((temp_end = (end && i+topic_range-1 > end) ? end : i+topic_range-1))
        get_topics_single "$i" "$temp_end" || break
    done
}
90 | |
# Print the ids of the messages in a topic, one per line.
# Globals:   group_id (read)
# Arguments: $1 - topic id
# Outputs:   for each subject link whose href contains "<topic id>/", the
#            part of the href that follows it
# Returns:   the exit status of sed
get_messages() {
    local topic_id="$1"

    # The topic id is spliced into the sed program inside double quotes so
    # that it can never be word-split into separate sed arguments.
    req_fragment "topic/${group_id}/${topic_id}" \
        | sed -n 's/.*<td class="subject"><a href="[^"]*'"$topic_id"'\/\([^"]*\)".*/\1/p'
}
97 | |
# Remove a duplicated header block from a downloaded message, in place.
#
# Some messages have two duplicate sets of headers, the second set starting
# with some X-Google stuff. The second set is removed only when the line
# right after the first CR-only blank line starts with "X-Google-Groups:".
#
# Arguments: $1 - path of the message file
# Returns:   0 when nothing had to be changed, otherwise the status of the
#            in-place edit and backup removal
fix_message() {
    local file="$1"

    # Nothing to inspect in an empty or missing file.
    [[ -s "$file" ]] || return 0

    # The probe exits 0 only when the duplicate header block is really there.
    # The two "$q 1" guards cover files without a CR-only blank line and files
    # that end on one; without them sed would exit 0 and the delete below
    # could remove everything from an X-Google-Groups: header to end of file.
    if sed -n '/^\r$/{ $q 1; n; /^X-Google-Groups:/q; q 1; }; $q 1' -- "$file"; then
        debug_print 'Removing duplicate headers'
        sed -i~ '/^X-Google-Groups:/,/^\r$/d' -- "$file" && rm -- "$file~"
    fi
}
108 | |
# Download a single raw message into dest_dir unless it is already there.
# Globals:   dest_dir, group_id, ln_dir (read)
# Arguments: $1 - topic id
#            $2 - message id
# Outputs:   progress lines on stdout, failures on stderr
# Returns:   1 when no temporary file could be created; a failed download is
#            reported on stderr but does not produce a failure status
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Keep the working variables local so they do not leak into the caller.
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return
    fi

    echo "message ${topic_id}${msg_id} downloading."
    if ! temp=$(mktemp); then
        echo "message ${topic_id}${msg_id} failed to download." >&2
        return 1
    fi

    # -f makes curl fail on HTTP error responses. Without it an error page
    # would be stored as the message and then skipped as "already downloaded"
    # on every later run.
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -f -o "$temp"
    then
        fix_message "$temp"
        mv -- "$temp" "$path"
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
132 | |
# Download every message of a topic.
# Arguments: $1 - topic id
# Outputs:   the progress output of download_message
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # read -r keeps backslashes in ids intact. The loop body gets its own
    # stdin so nothing it runs can swallow the remaining ids.
    get_messages "$topic_id" | while read -r msg_id _; do
        download_message "$topic_id" "$msg_id" </dev/null
    done
}
140 | |
# Download all messages of the topics in the given range.
# Globals:   dest_dir (read) - created if it does not exist yet
# Arguments: $1 - first topic number, passed to get_topics
#            $2 - last topic number, passed to get_topics
# Outputs:   the progress output of download_messages
# Exits:     with status 1 when dest_dir cannot be created
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    mkdir -p -- "$dest_dir" || exit 1
    # Only the first field (the topic id) is needed; the rest of the line is
    # discarded. The id is passed quoted so it is never glob-expanded or
    # split into several arguments.
    get_topics "$start" "$end" | while read -r topic_id _; do
        download_messages "$topic_id"
    done
}
150 | |
# Print the program name and version.
# Globals:   VERSION (read)
# Outputs:   "ggscrape <VERSION>" on stdout
show_version() {
    # Quoted so the version string is printed exactly as defined.
    printf 'ggscrape %s\n' "$VERSION"
}
154 | |
# Print the usage text.
# Outputs:   the help screen on stdout
show_help() {
    # The delimiter is quoted: the text contains nothing to expand, and this
    # keeps it literal. Options and descriptions are separated by at least
    # two spaces so the columns stay readable.
    cat <<'EOF'
ggscrape. Download emails from a Google Group

Usage:
  ggscrape <group_id> test
  ggscrape <group_id> topics
  ggscrape <group_id> messages <topic_id>
  ggscrape <group_id> download <dest_dir>
  ggscrape fix_message <file>

Options:
  -h, --help              Show this screen
  --version               Show version
  -v, --verbose           Show debug info
  -c, --cookie <cookie>   Use the given cookie string
  -b, --begin <topicnum>  Topic number at which to begin downloading
  -e, --end <topicnum>    Topic number at which to stop downloading
  -l, --ln <ln_dir>       Hard link email files into this directory

Environmental variables:

  GG_COOKIE               use as value for --cookie

EOF
}
181 | |
# State filled in by the command-line parser.
cmd=
topic_id=
topic_begin=
topic_end=
fix_file=

# Abort with a usage error when an option is not followed by a value.
# Arguments: the remaining command-line words, option first
require_value() {
    if (( $# < 2 )); then
        echo "Option $1 requires an argument" >&2
        exit "${EX_USAGE:-64}"
    fi
}

# Parse the command line into the global settings.
# Globals:   opt_verbose, cookie_str, topic_begin, topic_end, ln_dir, cmd,
#            topic_id, dest_dir, fix_file, group_id (written)
# Arguments: the script's command-line words
# Exits:     after printing help or version, on an unknown option or extra
#            word, and when an option is missing its value
parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            -h|--help) show_help; exit ;;
            --version) show_version; exit ;;
            -v|--verbose) opt_verbose=1 ;;
            -c|--cookie) require_value "$@"; cookie_str="$2"; shift ;;
            -b|--begin) require_value "$@"; topic_begin="$2"; shift ;;
            -e|--end) require_value "$@"; topic_end="$2"; shift ;;
            -l|--ln) require_value "$@"; ln_dir="$2"; shift ;;
            topics) cmd=topics ;;
            test) cmd=test ;;
            messages) cmd=messages; topic_id="$2"; shift ;;
            download) cmd=download; dest_dir="$2"; shift ;;
            fix_message) cmd=fix_message; fix_file="$2"; shift ;;
            # An unrecognised option must not be mistaken for the group id.
            -*) unknown_option "$1" ;;
            *)
                if [[ -z "$group_id" ]]; then
                    group_id="$1"
                else
                    unknown_option "$1"
                fi
                ;;
        esac
        shift
    done
    return 0
}

parse_args "$@"
210 | |
# fix_message works on a local file, so it is handled before a group id is
# required. Without a file argument the help text is shown instead.
if [[ "$cmd" == fix_message ]]; then
    [[ -n "$fix_file" ]] || { show_help; exit 1; }
    fix_message "$fix_file"
    exit
fi

# Every remaining command needs a group id.
[[ -n "$group_id" ]] || { show_help; exit 1; }

# Run the selected command; with no command at all, show the help text.
case "$cmd" in
    download) download_mails "$topic_begin" "$topic_end" ;;
    messages) get_messages "$topic_id" ;;
    topics)   get_topics "$topic_begin" "$topic_end" ;;
    test)     check_permission && echo Success ;;
    '')       show_help; exit ;;
    *)        echo "Unknown command $cmd" >&2 ;;
esac
233 |
Built with git-ssb-web