git ssb

0+

cel / ggscrape



Commit 1f4de2eeab8c60b37f065c88bd87aa3ba6c51e0d

Initial commit

Charles Lehner committed on 9/12/2014, 4:33:47 PM

Files changed

ggscrape added
ggscrape View
@@ -1,0 +1,183 @@
#!/bin/bash
# vi: ts=4 sw=4 et
#
# ggscrape(1)
# Download emails from a Google Group
#
# Copyright (c) 2014 Charles Lehner
# Released under the terms of the MIT License.

VERSION=0.0.1
# Prefix of every request URL; the query path is appended directly to it.
BASE_URL='https://groups.google.com/forum/?_escaped_fragment_='

# Exit status used for command-line usage errors.
EX_USAGE=64
# Number of topics requested per listing page.
topic_range=100

opt_verbose=
group_id=
dest_dir=
# Cookie string sent with every request; defaults to $GG_COOKIE when that is
# set and non-empty, otherwise empty.
cookie_str=${GG_COOKIE:-}
24 +
# Print the arguments, joined by single spaces, as one line on stderr.
# Does nothing unless $opt_verbose is non-empty. Always returns 0.
debug_print() {
    [[ -n $opt_verbose ]] || return 0
    # "$*" keeps each argument intact: no word splitting, no glob expansion.
    printf '%s\n' "$*" >&2
}
30 +
# Fetch a page with curl.
#   $1      query path, appended to $BASE_URL
#   $2...   extra arguments handed to curl unchanged (e.g. -I)
# Sends $cookie_str via curl's -b option. Output and exit status are curl's.
req() {
    local query=${1-}
    [[ $# -gt 0 ]] && shift

    debug_print query: "$query"
    curl -sN -b "$cookie_str" "${BASE_URL}${query}" "$@"
}
35 +
# Check whether the listing for $group_id is readable with the current cookie.
# Requests only the headers (-I) for topic range 1-1 and succeeds when the
# response carries an HTTP 200 status line.
test_permission() {
    debug_print testing permission
    # Match on the status code only: HTTP/2 status lines ("HTTP/2 200") have
    # no "OK" reason phrase, while HTTP/1.x ones ("HTTP/1.1 200 OK") do.
    req "forum/${group_id}%5B1-1-false%5D" -I \
        | grep -Eq '^HTTP/[0-9.]+ 200([^0-9]|$)'
}
40 +
# List one page of topics of $group_id.
#   $1  number of the first topic to list
#   $2  number of the last topic to list
# The first sed extracts, from the listing markup, the last path component of
# each "subject" link, the link text, and the "lastPostDate" cell. The second
# sed joins each group of three lines into one tab-separated line, swapping
# the second and third.
# NOTE(review): presumably the fields come out as topic id, date, title; that
# depends on the order of the cells in the markup, so confirm on live output.
# The sed programs are single-quoted so the shell never interprets them.
# They rely on GNU sed (exit-code argument to q, \0, \n and \t in
# replacements, the m flag).
get_topics_single() {
    local start=$1
    local end=$2

    debug_print "get topics $group_id [$start-$end]"

    req "forum/${group_id}%5B${start}-${end}-false%5D" | sed -n \
        's/^<i>Showing [^<]* of 0 topics<\/i>$//p;
        s/<tr>/\0\n/; /lastPostDate/ {
            s/.*lastPostDate">\([^<]*\).*$/\1/m; P; D;
        };
        /subject/{
            s/.*href="[^"]*\/\([^"]*\)"[^>]*>\([^<]*\).*/\1\n\2/; p;
        }' | sed '/^$/q99; N; N; s/\(.*\)\n\(.*\)\n\(.*\)/\1\t\3\t\2/'
    # return status 99 if no matches found: the "of 0 topics" line is turned
    # into an empty line above, and the second sed quits with 99 on it
}
57 +
# Report an unrecognised command-line word ($1) on stderr and exit the script
# with the usage-error status ($EX_USAGE, falling back to 64 if it is unset).
unknown_option() {
    printf 'Unknown option %s\n' "$1" >&2
    exit "${EX_USAGE:-64}"
}
62 +
# List topics of $group_id, fetching them in pages of $topic_range topics.
#   $1  number of the first topic (empty, 0 or negative means 1)
#   $2  number of the last topic (empty or 0 means no upper limit)
# Each page is printed by get_topics_single; listing stops as soon as that
# function returns non-zero.
get_topics() {
    local start=$(($1))
    local end=$(($2))
    local temp_end
    local i

    # $(( )) turns an empty argument into 0, never into an empty string, so
    # the default has to be applied by testing the number itself.
    if ((start < 1)); then
        start=1
    fi

    debug_print "get all topics $group_id [$start-$end]"

    for ((i = start; i <= end || end == 0; i += topic_range)); do
        # Clamp the last page to the requested end, if there is one.
        temp_end=$((i + topic_range - 1))
        if ((end != 0 && temp_end > end)); then
            temp_end=$end
        fi
        get_topics_single "$i" "$temp_end" || break
    done
}
79 +
# get message ids in a topic
#   $1  topic id
# Fetches the topic page of $group_id and prints, one per line, the text that
# follows "<topic id>/" in the href of each "subject" cell link.
# The topic id is inserted into the sed pattern as-is, so it must not contain
# characters that are special in a sed regular expression (or a "/").
get_messages() {
    local topic_id="$1"
    req "topic/${group_id}/${topic_id}" | sed -n \
        's/.*<td class="subject"><a href="[^"]*'"$topic_id"'\/\([^"]*\)".*/\1/p'
}
86 +
# Download one message into $dest_dir.
#   $1  topic id
#   $2  message id
# The file is named "<group_id><topic_id><msg_id>.eml". A file that already
# exists and is non-empty is left alone and reported on stdout.
# Returns the status of the request; on failure the partial file is removed
# so that a later run does not mistake it for a finished download.
download_message() {
    local topic_id="$1"
    local msg_id="$2"
    local path

    debug_print download topic "$topic_id" message "$msg_id"
    path="${dest_dir}/${group_id}${topic_id}${msg_id}.eml"

    if [[ -s "$path" ]]; then
        echo "file for message ${msg_id} already exists. skipping."
        return 0
    fi

    if ! req "${group_id}/${topic_id}/${msg_id}" > "$path"; then
        rm -f -- "$path"
        return 1
    fi
}
98 +
# Download every message of one topic.
#   $1  topic id
# Reads message ids from get_messages (first word of each line) and passes
# each one to download_message.
download_messages() {
    local topic_id="$1"
    local msg_id

    debug_print download topic "$topic_id"
    # -r keeps backslashes in the ids literal.
    get_messages "$topic_id" | while read -r msg_id _; do
        download_message "$topic_id" "$msg_id"
    done
}
106 +
# Download the messages of every topic in a range.
#   $1  number of the first topic
#   $2  number of the last topic
# Both are passed straight to get_topics. The first word of each line it
# prints is taken as a topic id and handed to download_messages.
download_mails() {
    local start=$1
    local end=$2
    local topic_id

    get_topics "$start" "$end" | while read -r topic_id _; do
        # An empty line would otherwise trigger a request with no topic id.
        [[ -n "$topic_id" ]] || continue
        download_messages "$topic_id"
    done
}
115 +
# Print the program name and $VERSION on stdout.
show_version() {
    printf 'ggscrape %s\n' "$VERSION"
}
119 +
# Print the usage screen on stdout.
# The heredoc delimiter is quoted so the text is emitted literally, with no
# parameter or command expansion.
show_help() {
    cat << 'EOF'
ggscrape. Download emails from a Google Group

Usage:
    ggscrape <group_id> [test]
    ggscrape <group_id> topics
    ggscrape <group_id> messages <topic_id>
    ggscrape <group_id> download <directory>

Options:
    -h, --help              Show this screen
    --version               Show version
    -v, --verbose           Show debug info
    -c, --cookie <cookie>   Use the given cookie string
    -b, --begin <topicnum>  Topic number at which to begin downloading
    -e, --end <topicnum>    Topic number at which to stop downloading

Environmental variables:

    GG_COOKIE               use as value for --cookie

EOF
}
144 +
# ---- Command-line parsing and dispatch ----

cmd=
topic_id=
topic_begin=
topic_end=

# Exit with a usage error when option or command $1 has no value after it.
# $2 is the number of command-line words not yet consumed, $1 included.
require_value() {
    if (( $2 < 2 )); then
        echo "Missing argument for $1" >&2
        exit "${EX_USAGE:-64}"
    fi
}

# Options and commands may appear in any order. The first word that is
# neither an option nor a command name is taken as the group id.
while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -h|--help) show_help; exit;;
        --version) show_version; exit;;
        -v|--verbose) opt_verbose=1;;
        -c|--cookie) require_value "$1" "$#"; cookie_str="$2"; shift;;
        -b|--begin) require_value "$1" "$#"; topic_begin="$2"; shift;;
        -e|--end) require_value "$1" "$#"; topic_end="$2"; shift;;
        topics) cmd=topics;;
        test) cmd=test;;
        messages) require_value "$1" "$#"; cmd=messages; topic_id="$2"; shift;;
        download) require_value "$1" "$#"; cmd=download; dest_dir="$2"; shift;;
        # Reject unknown options instead of taking them for the group id.
        -*) unknown_option "$1";;
        *) if [[ -z "$group_id" ]]; then
                group_id="$1"
            else
                unknown_option "$1"
            fi;;
    esac
    shift
done

if [[ -z "$group_id" ]]; then
    show_help
    exit 1
fi

case "$cmd" in
    '') show_help; exit;;
    test) test_permission;;
    topics) get_topics "$topic_begin" "$topic_end";;
    download) download_mails "$topic_begin" "$topic_end";;
    messages) get_messages "$topic_id";;
    *) echo "Unknown command $cmd" >&2; exit "${EX_USAGE:-64}";;
esac

Built with git-ssb-web