Experimental forum download

This commit is contained in:
FliegendeWurst 2020-04-22 12:03:36 +02:00
parent f8a9350386
commit 1aca252f8c

View File

@ -18,6 +18,7 @@ use std::io;
use std::panic; use std::panic;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration;
mod errors; mod errors;
use errors::*; use errors::*;
@ -55,6 +56,9 @@ enum Object {
name: String, name: String,
url: URL url: URL
}, },
Thread {
url: URL
},
Wiki { Wiki {
name: String, name: String,
url: URL url: URL
@ -85,6 +89,7 @@ impl Object {
Folder { name, .. } => &name, Folder { name, .. } => &name,
File { name, .. } => &name, File { name, .. } => &name,
Forum { name, .. } => &name, Forum { name, .. } => &name,
Thread { url } => &url.thr_pk.as_ref().unwrap(),
Wiki { name, .. } => &name, Wiki { name, .. } => &name,
ExerciseHandler { name, .. } => &name, ExerciseHandler { name, .. } => &name,
PluginDispatch { name, .. } => &name, PluginDispatch { name, .. } => &name,
@ -93,12 +98,28 @@ impl Object {
} }
} }
/// Returns a reference to the [`URL`] stored in this object.
///
/// # Panics
///
/// Panics (via `unreachable!()`) when called on a `Video`; that is the only
/// variant this accessor does not read a `url` field from.
fn url(&self) -> &URL {
    match self {
        // Handled first so the one panicking case is easy to spot.
        Video { .. } => unreachable!(),
        // Every remaining variant exposes its address under the same field name.
        Course { url, .. }
        | Folder { url, .. }
        | File { url, .. }
        | Forum { url, .. }
        | Thread { url }
        | Wiki { url, .. }
        | ExerciseHandler { url, .. }
        | PluginDispatch { url, .. }
        | Generic { url, .. } => url,
    }
}
fn kind(&self) -> &str { fn kind(&self) -> &str {
match self { match self {
Course { .. } => "course", Course { .. } => "course",
Folder { .. } => "folder", Folder { .. } => "folder",
File { .. } => "file", File { .. } => "file",
Forum { .. } => "forum", Forum { .. } => "forum",
Thread { .. } => "thread",
Wiki { .. } => "wiki", Wiki { .. } => "wiki",
ExerciseHandler { .. } => "exercise handler", ExerciseHandler { .. } => "exercise handler",
PluginDispatch { .. } => "plugin dispatch", PluginDispatch { .. } => "plugin dispatch",
@ -111,6 +132,12 @@ impl Object {
let mut name = link.text().collect::<String>().replace('/', "-"); let mut name = link.text().collect::<String>().replace('/', "-");
let url = URL::from_href(link.value().attr("href").unwrap()); let url = URL::from_href(link.value().attr("href").unwrap());
if url.thr_pk.is_some() {
return Thread {
url
};
}
if url.url.starts_with("https://ilias.studium.kit.edu/goto.php") { if url.url.starts_with("https://ilias.studium.kit.edu/goto.php") {
let item_prop = Selector::parse("span.il_ItemProperty").unwrap(); let item_prop = Selector::parse("span.il_ItemProperty").unwrap();
let mut item_props = item.select(&item_prop); let mut item_props = item.select(&item_prop);
@ -151,6 +178,8 @@ struct URL {
cmdNode: Option<String>, cmdNode: Option<String>,
cmd: Option<String>, cmd: Option<String>,
forwardCmd: Option<String>, forwardCmd: Option<String>,
thr_pk: Option<String>,
pos_pk: Option<String>,
ref_id: String, ref_id: String,
} }
@ -163,6 +192,8 @@ impl URL {
let mut cmdNode = None; let mut cmdNode = None;
let mut cmd = None; let mut cmd = None;
let mut forwardCmd = None; let mut forwardCmd = None;
let mut thr_pk = None;
let mut pos_pk = None;
let mut ref_id = String::new(); let mut ref_id = String::new();
for (k, v) in url.query_pairs() { for (k, v) in url.query_pairs() {
match &*k { match &*k {
@ -171,6 +202,8 @@ impl URL {
"cmdNode" => cmdNode = Some(v.into_owned()), "cmdNode" => cmdNode = Some(v.into_owned()),
"cmd" => cmd = Some(v.into_owned()), "cmd" => cmd = Some(v.into_owned()),
"forwardCmd" => forwardCmd = Some(v.into_owned()), "forwardCmd" => forwardCmd = Some(v.into_owned()),
"thr_pk" => thr_pk = Some(v.into_owned()),
"pos_pk" => pos_pk = Some(v.into_owned()),
"ref_id" => ref_id = v.into_owned(), "ref_id" => ref_id = v.into_owned(),
_ => {} _ => {}
} }
@ -182,6 +215,8 @@ impl URL {
cmdNode, cmdNode,
cmd, cmd,
forwardCmd, forwardCmd,
thr_pk,
pos_pk,
ref_id ref_id
} }
} }
@ -194,6 +229,7 @@ impl ILIAS {
let client = Client::builder() let client = Client::builder()
.cookie_store(true) .cookie_store(true)
.user_agent("KIT-ILIAS-downloader/0.1.0") .user_agent("KIT-ILIAS-downloader/0.1.0")
.max_idle_per_host(1)
.build()?; .build()?;
let this = ILIAS { let this = ILIAS {
opt, client, user, pass opt, client, user, pass
@ -304,13 +340,16 @@ async fn main() {
while let Some((path, obj)) = queue.pop_front() { while let Some((path, obj)) = queue.pop_front() {
let ilias = Arc::clone(&ilias); let ilias = Arc::clone(&ilias);
task::spawn(async { task::spawn(async {
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1; *TASKS_RUNNING.lock() += 1;
process(ilias, path, obj).await; process(ilias, path, obj).await;
*TASKS_RUNNING.lock() -= 1; *TASKS_RUNNING.lock() -= 1;
}); });
} }
while *TASKS_RUNNING.lock() > 0 { while *TASKS_RUNNING.lock() > 0 {
tokio::time::delay_for(std::time::Duration::from_millis(500)).await; tokio::time::delay_for(Duration::from_millis(500)).await;
} }
} }
@ -339,6 +378,9 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
path.push(item.name()); path.push(item.name());
let ilias = Arc::clone(&ilias); let ilias = Arc::clone(&ilias);
task::spawn(async { task::spawn(async {
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1; *TASKS_RUNNING.lock() += 1;
process(ilias, path, item).await; process(ilias, path, item).await;
*TASKS_RUNNING.lock() -= 1; *TASKS_RUNNING.lock() -= 1;
@ -357,6 +399,9 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
path.push(item.name()); path.push(item.name());
let ilias = Arc::clone(&ilias); let ilias = Arc::clone(&ilias);
task::spawn(async { task::spawn(async {
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1; *TASKS_RUNNING.lock() += 1;
process(ilias, path, item).await; process(ilias, path, item).await;
*TASKS_RUNNING.lock() -= 1; *TASKS_RUNNING.lock() -= 1;
@ -425,6 +470,9 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
}; };
let ilias = Arc::clone(&ilias); let ilias = Arc::clone(&ilias);
task::spawn(async { task::spawn(async {
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1; *TASKS_RUNNING.lock() += 1;
process(ilias, path, video).await; process(ilias, path, video).await;
*TASKS_RUNNING.lock() -= 1; *TASKS_RUNNING.lock() -= 1;
@ -476,6 +524,103 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
let mut file = BufWriter::new(file); let mut file = BufWriter::new(file);
tokio::io::copy(&mut reader, &mut file).await.unwrap(); tokio::io::copy(&mut reader, &mut file).await.unwrap();
}, },
// Forum: create a directory for the forum, load its thread table and spawn
// one `process` task per thread row found.
Forum { url, .. } => {
// Create the target directory. Errors other than "already exists" are only
// printed; processing continues in every case.
if let Err(e) = fs::create_dir(&path) {
if e.kind() != io::ErrorKind::AlreadyExists {
println!("error: {:?}", e);
}
}
// Address of the forum's thread listing, built from the forum's ref_id.
let url = format!("{}ilias.php?ref_id={}&cmd=showThreads&cmdClass=ilrepositorygui&cmdNode=uf&baseClass=ilrepositorygui", ILIAS_URL, url.ref_id);
// Two-step fetch: load the listing, look for a link containing "trows=800",
// then load and parse the page behind that link.
let html = {
let a = Selector::parse("a").unwrap();
let data = ilias.download(&url);
let html_text = data.await.unwrap().text().await.unwrap();
let url = {
let html = Html::parse_document(&html_text);
// Example of the link being searched for:
//https://ilias.studium.kit.edu/ilias.php?ref_id=122&cmdClass=ilobjforumgui&frm_tt_e39_122_trows=800&cmd=showThreads&cmdNode=uf:lg&baseClass=ilrepositorygui
let url = {
// First <a> whose href contains "trows=800"; panics with the message
// below if the page has no such link.
// NOTE(review): presumably this switches the table to 800 rows per page,
// so larger forums might be listed incompletely - confirm.
let t800 = html.select(&a).filter(|x| x.value().attr("href").unwrap_or("").contains("trows=800")).next().expect("can't find forum thread count selector");
t800.value().attr("href").unwrap()
};
// The href is appended to ILIAS_URL to form the address to fetch.
format!("{}{}", ILIAS_URL, url)
};
let data = ilias.download(&url);
let html = data.await.unwrap().text().await.unwrap();
Html::parse_document(&html)
};
let a = Selector::parse("a").unwrap();
let tr = Selector::parse("tr").unwrap();
let td = Selector::parse("td").unwrap();
for row in html.select(&tr) {
let cells = row.select(&td).collect::<Vec<_>>();
// Only rows with exactly six cells are treated as thread rows.
if cells.len() != 6 {
continue;
}
// The thread link is the first <a> in the second cell; panics if that cell
// contains no link.
let link = cells[1].select(&a).next().unwrap();
// The same element is passed for both arguments of `from_link`.
let object = Object::from_link(link, link);
let mut path = path.clone();
// Directory name: "<thr_pk>_<link text>", with '/' replaced by '-' and the
// result trimmed.
let name = format!("{}_{}", object.url().thr_pk.as_ref().expect("thr_pk not found for thread"), link.text().collect::<String>().replace('/', "-").trim());
path.push(name);
let ilias = Arc::clone(&ilias);
task::spawn(async {
// Throttle: poll every 100 ms until the running-task counter is at most 1.
// NOTE(review): the check and the increment below lock the counter
// separately, so several tasks may pass the check together - confirm
// whether the limit is meant to be strict.
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1;
process(ilias, path, object).await;
*TASKS_RUNNING.lock() -= 1;
});
}
},
// Thread: create a directory for the thread, load the thread page and write
// each post's content to its own .html file inside that directory.
Thread { url } => {
if let Err(e) = fs::create_dir(&path) {
if e.kind() != io::ErrorKind::AlreadyExists {
println!("error: {:?}", e);
}
// skip already downloaded
// TODO: compare modification date
// NOTE(review): this return is reached for every create_dir error, not
// only AlreadyExists - confirm that is intended.
if !ilias.opt.force {
return;
}
}
// The thread's stored address is appended to ILIAS_URL before fetching.
let url = format!("{}{}", ILIAS_URL, url.url);
let data = ilias.download(&url);
let html = data.await.unwrap().text().await.unwrap();
let html = Html::parse_document(&html);
let post = Selector::parse(".ilFrmPostRow").unwrap();
let post_container = Selector::parse(".ilFrmPostContentContainer").unwrap();
let post_title = Selector::parse(".ilFrmPostTitle").unwrap();
let post_content = Selector::parse(".ilFrmPostContent").unwrap();
let span_small = Selector::parse("span.small").unwrap();
let a = Selector::parse("a").unwrap();
// One output file per `.ilFrmPostRow` element. Each `unwrap()` below panics
// if the expected child element or attribute is missing.
for post in html.select(&post) {
// Title text with '/' replaced by '-'.
let title = post.select(&post_title).next().unwrap().text().collect::<String>().replace('/', "-");
// Author: the second '|'-separated field of the first `span.small`, trimmed.
// NOTE(review): unlike the title, '/' is not replaced here although the
// value becomes part of the file name - confirm it can never contain '/'.
let author = post.select(&span_small).next().unwrap();
let author = author.text().collect::<String>();
let author = author.trim().split('|').nth(1).unwrap().trim();
let container = post.select(&post_container).next().unwrap();
// The `name` attribute of the first <a> in the content container is used as
// the file-name prefix.
let link = container.select(&a).next().unwrap();
let name = format!("{}_{}_{}.html", link.value().attr("name").unwrap(), author, title.trim());
// The file body is the inner HTML of the `.ilFrmPostContent` element.
let data = post.select(&post_content).next().unwrap();
let data = data.inner_html();
let mut path = path.clone();
path.push(name);
let ilias = Arc::clone(&ilias);
task::spawn(async move {
// Throttle: poll every 100 ms until the running-task counter is at most 1.
// NOTE(review): the check and the increment lock the counter separately,
// so several tasks may pass the check together.
while *TASKS_RUNNING.lock() > 1 {
tokio::time::delay_for(Duration::from_millis(100)).await;
}
*TASKS_RUNNING.lock() += 1;
if ilias.opt.verbose > 1 {
println!("Writing to {:?}..", path);
}
// NOTE(review): if either unwrap below panics, the decrement at the end of
// this task is not executed - check the effect on loops waiting for the
// counter to drop.
let file = AsyncFile::create(&path).await.unwrap();
let mut file = BufWriter::new(file);
tokio::io::copy(&mut data.as_bytes(), &mut file).await.unwrap();
*TASKS_RUNNING.lock() -= 1;
});
}
},
o => { o => {
if ilias.opt.verbose > 0 { if ilias.opt.verbose > 0 {
println!("ignoring {:#?}", o) println!("ignoring {:#?}", o)