git ssb

0+

cel / ggscrape



Tree: 806ab97eba21eba1b9b85de34dca1fbeb197f602

Files: 806ab97eba21eba1b9b85de34dca1fbeb197f602 / ggscrape

5655 bytes | Raw
1#!/bin/bash
2# vi: ts=4 sw=4 et
3#
4# ggscrape(1)
5# Download emails from a Google Groups
6#
7# Copyright (c) 2014 Charles Lehner
8# Released under the terms of the MIT License.
9
# Program version, printed by show_version.
readonly VERSION=1.0.0
# Prefix to which every request path is appended.
readonly BASE_URL='https://groups.google.com/forum/'

# Exit status used for command-line usage errors.
readonly EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

# Settings filled in later by the command-line parser.
opt_verbose=
group_id=
dest_dir=
ln_dir=
# Cookie string handed to curl. GG_COOKIE provides the default; the
# --cookie option overrides it.
cookie_str=${GG_COOKIE:-}
25
# Write the arguments to stderr as one line when verbose output is
# enabled (opt_verbose is non-empty); otherwise do nothing.
#
# Arguments: the words of the message. They are joined with the first
#            character of IFS (a space unless IFS was changed).
debug_print() {
    if [[ -n $opt_verbose ]]; then
        # printf with "$*" emits the text exactly as given: no word
        # splitting, no glob expansion, and a leading "-n" or "-e" is
        # not swallowed as an echo option.
        printf '%s\n' "$*" >&2
    fi
}
31
# Perform an HTTP request with curl against BASE_URL.
#
# Arguments:
#   $1   - path (and query string) appended to BASE_URL
#   $2.. - extra options passed through to curl unchanged
# Globals:  BASE_URL, cookie_str (read)
# Outputs:  whatever curl writes to stdout
# Returns:  curl's exit status
req() {
    local url_path=${1-}
    if (( $# )); then
        shift
    fi
    debug_print req: "$url_path"
    # The path is split off explicitly instead of relying on
    # "$BASE_URL$@", which glues the prefix onto the first argument only.
    curl -sN -b "$cookie_str" "$BASE_URL$url_path" "$@"
}
36
# Request a page through the "?_escaped_fragment_=" query parameter.
#
# Arguments:
#   $1   - fragment value appended after "?_escaped_fragment_="
#   $2.. - extra options handed on to req (and from there to curl)
# Returns: the exit status of req
req_fragment() {
    local fragment=${1-}
    if (( $# )); then
        shift
    fi
    # Only the first argument belongs in the query string; the rest stay
    # separate words.
    req "?_escaped_fragment_=$fragment" "$@"
}
40
# Check that the group's topic listing can be fetched.
#
# Requests only the response headers (curl -I) of a one-topic listing
# and looks for a 200 status line.
#
# Globals:  group_id (read)
# Outputs:  'Unable to access group.' on stderr when the check fails
# Returns:  0 when a 200 status line was seen, 1 otherwise
check_permission() {
    debug_print testing permission
    # Match on the status code alone: an HTTP/2 status line is
    # "HTTP/2 200" with no "OK" reason phrase, so grepping for '200 OK'
    # wrongly rejects such responses.
    if ! req_fragment "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
    then
        echo 'Unable to access group.' >&2
        return 1
    fi
}
49
# Fetch one page of the group's topic listing and print one line per
# topic.
#
# Arguments:
#   $1 - number of the first topic to list
#   $2 - number of the last topic to list
# Globals:  group_id (read)
# Outputs:  three tab-separated fields per topic, assembled from three
#           consecutive extracted lines in the order first, third, second
# Returns:  the exit status of the final sed; 99 when the page reports
#           "of 0 topics"
get_topics_single() {
    local first=$1
    local last=$2
    local page="forum/${group_id}%5B${first}-${last}-false%5D"

    # Stage 1: turn the "Showing ... of 0 topics" notice into an empty
    # line; from lines containing lastPostDate keep the text that follows
    # it; from lines containing subject keep the last path component of
    # the link and the link text, on two lines.
    local extract='s/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href="[^"]*\/\([^"]*\)"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }'

    # Stage 2: an empty line means "no topics" and ends the stream with
    # status 99; otherwise three lines are joined into one record.
    local assemble='/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'

    debug_print "get topics $group_id [$first-$last]"

    req_fragment "$page" | sed -n "$extract" | sed "$assemble"
}
66
# Complain on stderr about a command-line word that was not recognised,
# then terminate the script with the EX_USAGE status.
#
# Arguments: $1 - the offending word
unknown_option() {
    local word=$1
    printf 'Unknown option %s\n' "$word" >&2
    exit $EX_USAGE
}
71
# List the group's topics in pages of topic_range entries.
#
# Arguments:
#   $1 - number of the first topic; empty or 0 means start at 1
#   $2 - number of the last topic; empty or 0 means keep going until a
#        page request reports failure
# Globals:  topic_range, group_id (read)
# Outputs:  whatever get_topics_single prints for each page
# Returns:  1 when check_permission fails
get_topics() {
    local start=$(($1))
    local end=$(($2))
    local temp_end
    # Keep the loop counter local so it cannot clobber a caller's "i".
    local i

    if ((start == 0)); then
        start=1
    fi

    check_permission || return 1

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the page's upper bound to "end" when an end was given.
        ((temp_end = (end && i + topic_range - 1 > end) ? end : i + topic_range - 1))
        get_topics_single "$i" "$temp_end" || break
    done
}
90
# Print the ids of the messages in a topic, one per line.
#
# Arguments: $1 - topic id
# Globals:   group_id (read)
# Outputs:   for every subject link whose URL contains "<topic_id>/",
#            the part of the URL that follows it
get_messages() {
    local topic_id="$1"
    # The topic id is quoted where it is spliced into the sed program so
    # that whitespace or glob characters cannot split or expand the
    # command word. It is still interpreted as part of the pattern.
    req_fragment "topic/${group_id}/${topic_id}" \
        | sed -n 's/.*<td class="subject"><a href="[^"]*'"$topic_id"'\/\([^"]*\)".*/\1/p'
}
97
# Remove a duplicated header block from a downloaded message, in place.
#
# Some messages have two duplicate sets of headers, the second set
# starting with some X-Google stuff. Remove the second set of headers if
# they are present.
#
# Arguments: $1 - path of the message file (CRLF line endings expected)
# Returns:   0 when nothing had to be changed; otherwise the status of
#            the in-place edit and backup removal
fix_message() {
    local file="$1"
    # Duplicate headers are present only when the line right after the
    # first blank (CR-only) line starts with "X-Google-Groups:". The check
    # must fail when the file has no blank line, or ends at it, so the
    # deletion below never runs on a message that has no second header
    # set.
    if awk '
        after_blank { dup = ($0 ~ /^X-Google-Groups:/); exit }
        /^\r$/      { after_blank = 1 }
        END         { exit !dup }
    ' "$file"; then
        debug_print 'Removing duplicate headers'
        sed -i~ '/^X-Google-Groups:/,/^\r$/d' "$file" && rm -- "$file~"
    fi
}
108
# Download one raw message into dest_dir unless it is already there.
#
# Arguments:
#   $1 - topic id
#   $2 - message id
# Globals:  dest_dir, group_id, ln_dir (read)
# Outputs:  a progress line on stdout; a failure notice on stderr
# Returns:  non-zero when the temporary file cannot be created or the
#           download cannot be moved into place
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    # Local so the caller's (or the global) "path" and "temp" are not
    # overwritten.
    local path temp

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"

    # A non-empty file counts as already downloaded.
    if [[ -s "$path" ]]; then
        echo "message ${topic_id}${msg_id} already downloaded. skipping."
        return
    fi

    echo "message ${topic_id}${msg_id} downloading."
    # Download to a temporary file first so an interrupted transfer never
    # leaves a partial file at the final path.
    temp=$(mktemp) || return 1
    if req "message/raw?msg=${group_id}/${topic_id}/${msg_id}" -o "$temp"; then
        fix_message "$temp"
        if ! mv -- "$temp" "$path"; then
            rm -f -- "$temp"
            return 1
        fi
        if [[ -n "$ln_dir" ]]; then
            ln -- "$path" "$ln_dir"
        fi
    else
        echo "message ${topic_id}${msg_id} failed to download." >&2
        rm -f -- "$temp"
    fi
}
132
# Download every message of one topic.
#
# Arguments: $1 - topic id
# Outputs:   the progress lines printed by download_message
download_messages() {
    local topic_id="$1"
    local msg_id
    debug_print download topic "$topic_id"
    # The first whitespace-separated field of each line is the message
    # id; -r keeps backslashes in it intact.
    get_messages "$topic_id" | while read -r msg_id _; do
        download_message "$topic_id" "$msg_id"
    done
}
140
# Download all messages of the topics in the given range into dest_dir.
#
# Arguments:
#   $1 - first topic number (passed to get_topics)
#   $2 - last topic number (passed to get_topics)
# Globals:  dest_dir (read)
# Exits the script with status 1 when dest_dir cannot be created.
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    mkdir -p -- "$dest_dir" || exit 1
    # Only the first field (the topic id) of each listing line is needed.
    get_topics "$start" "$end" | while read -r topic_id _; do
        # The listing is terminated by an empty line; skip it instead of
        # requesting a topic with an empty id.
        [[ -n "$topic_id" ]] || continue
        # Quoted so an id is never word-split or glob-expanded.
        download_messages "$topic_id"
    done
}
150
# Print the program name and version on stdout.
show_version() {
    # Quoted expansion: the version string is printed verbatim instead of
    # being word-split and glob-expanded.
    printf 'ggscrape %s\n' "$VERSION"
}
154
# Print the usage summary (commands, options, environment) on stdout.
show_help() {
    # The delimiter is quoted, so the text below is emitted literally.
    cat <<'USAGE'
ggscrape. Download emails from a Google Group

Usage:
 ggscrape <group_id> test
 ggscrape <group_id> topics
 ggscrape <group_id> messages <topic_id>
 ggscrape <group_id> download <dest_dir>
 ggscrape fix_message <file>

Options:
 -h, --help Show this screen
 --version Show version
 -v, --verbose Show debug info
 -c, --cookie <cookie> Use the given cookie string
 -b, --begin <topicnum> Topic number at which to begin downloading
 -e, --end <topicnum> Topic number at which to stop downloading
 -l, --ln <ln_dir> Hard link email files into this directory

Environmental variables:

 GG_COOKIE use as value for --cookie

USAGE
}
181
# Results of command-line parsing.
cmd=
topic_id=
topic_begin=
topic_end=
fix_file=

# Abort with a usage error unless the word being parsed is followed by a
# value.
#   $1 - the option or command word (used in the message)
#   $2 - number of words left on the command line, including $1
require_value() {
    if (( $2 < 2 )); then
        echo "Missing argument for $1" >&2
        exit "${EX_USAGE:-64}"
    fi
}

# Parse the command line into the global settings.
#
# Arguments: the script's command-line words
# Globals:   opt_verbose, cookie_str, topic_begin, topic_end, ln_dir,
#            cmd, topic_id, dest_dir, fix_file, group_id (written)
# The first word that is neither an option nor a command becomes the
# group id; any further such word is reported through unknown_option.
# --help and --version print their text and exit the script.
parse_args() {
    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            -h|--help) show_help; exit;;
            --version) show_version; exit;;
            -v|--verbose) opt_verbose=1;;
            -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
            -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
            -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
            -l|--ln) require_value "$1" "$#"; ln_dir="$2"; shift;;
            topics) cmd=topics;;
            test) cmd=test;;
            messages) require_value "$1" "$#"; cmd=messages; topic_id="$2"; shift;;
            download) require_value "$1" "$#"; cmd=download; dest_dir="$2"; shift;;
            fix_message)
                # A missing file is reported later by the fix_message
                # command handling, so only consume the value if present.
                cmd=fix_message
                fix_file="${2-}"
                if (( $# > 1 )); then
                    shift
                fi
                ;;
            *)
                if [[ -z "$group_id" ]]; then
                    group_id="$1"
                else
                    unknown_option "$1"
                fi
                ;;
        esac
        shift
    done
}

parse_args "$@"
210
# fix_message works on a local file, so it is handled before the group
# id is required.
if [[ "$cmd" == fix_message ]]; then
    [[ -n "$fix_file" ]] || { show_help; exit 1; }
    fix_message "$fix_file"
    exit
fi

# Every remaining command operates on a group.
[[ -n "$group_id" ]] || { show_help; exit 1; }

# Run the selected command; with no command, show the usage text.
case "$cmd" in
    test)
        check_permission && echo Success
        ;;
    topics)
        get_topics "$topic_begin" "$topic_end"
        ;;
    messages)
        get_messages "$topic_id"
        ;;
    download)
        download_mails "$topic_begin" "$topic_end"
        ;;
    '')
        show_help
        exit
        ;;
    *)
        echo "Unknown command $cmd" >&2
        ;;
esac
233

Built with git-ssb-web