Expand individual sessions

This commit is contained in:
FliegendeWurst 2022-12-19 19:13:08 +01:00
parent cc7dcd6f9d
commit ff982a5723
6 changed files with 38 additions and 6 deletions

12
Cargo.lock generated
View File

@@ -7,6 +7,7 @@ name = "KIT-ILIAS-downloader"
version = "0.3.6"
dependencies = [
"anyhow",
"async-recursion",
"atty",
"bytes",
"cfg-if",
@@ -111,6 +112,17 @@ dependencies = [
"winapi",
]
[[package]]
name = "async-recursion"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cda8f4bcc10624c4e85bc66b3f452cca98cfa5ca002dc83a16aad2367641bea"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "atty"
version = "0.2.14"

View File

@@ -36,3 +36,4 @@ bytes = "1.0.1"
toml = "0.5.8"
tempfile = "3.2.0"
ego-tree = "0.6.2"
async-recursion = "1.0.0"

View File

@@ -5,6 +5,7 @@ use std::{collections::HashMap, error::Error as _, io::Write, sync::Arc};
use anyhow::{anyhow, Context, Result};
use cookie_store::CookieStore;
use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::{Client, IntoUrl, Proxy, Url};
use reqwest_cookie_store::CookieStoreMutex;
use scraper::{ElementRef, Html, Selector};
@@ -267,8 +268,8 @@ impl ILIAS {
.collect()
}
/// Returns subfolders and the main text in a course/folder/personal desktop.
pub async fn get_course_content(&self, url: &URL) -> Result<(Vec<Result<Object>>, Option<String>)> {
/// Returns subfolders, the main text in a course/folder/personal desktop and all links on the page.
pub async fn get_course_content(&self, url: &URL) -> Result<(Vec<Result<Object>>, Option<String>, Vec<String>)> {
let html = self.get_html(&url.url).await?;
let main_text = if let Some(el) = html.select(&IL_CONTENT_CONTAINER).next() {
@@ -281,7 +282,7 @@ impl ILIAS {
} else {
None
};
Ok((ILIAS::get_items(&html), main_text))
Ok((ILIAS::get_items(&html), main_text, html.select(&LINKS).flat_map(|x| x.value().attr("href").map(|x| x.to_owned())).collect()))
}
pub async fn get_course_content_tree(&self, ref_id: &str, cmd_node: &str) -> Result<Vec<Object>> {

View File

@@ -28,11 +28,13 @@ pub async fn download(path: PathBuf, ilias: Arc<ILIAS>, url: &URL, name: &str) -
return Ok(()); // ignore groups we are not in
}
warning!(name, "falling back to incomplete course content extractor!", e);
ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x
let (items, main_text, _) = ilias.get_course_content(&url).await?;
(items, main_text)
},
}
} else {
ilias.get_course_content(&url).await?
let (items, main_text, _) = ilias.get_course_content(&url).await?;
(items, main_text)
};
if ilias.opt.save_ilias_pages {
if let Some(s) = content.1.as_ref() {

View File

@@ -1,6 +1,9 @@
use std::{collections::HashSet, path::Path, sync::Arc};
use anyhow::{Context, Result};
use async_recursion::async_recursion;
use once_cell::sync::Lazy;
use regex::Regex;
use crate::{
process_gracefully,
@@ -10,8 +13,20 @@ use crate::{
use super::{ILIAS, URL};
static EXPAND_LINK: Lazy<Regex> = Lazy::new(|| Regex::new("expand=\\d").unwrap());
#[async_recursion]
pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
let content = ilias.get_course_content(&url).await?;
// expand all sessions
for href in content.2 {
// link format: ilias.php?ref_id=1943526&expand=2602906&cmd=view&cmdClass=ilobjfoldergui&cmdNode=x1:nk&baseClass=ilrepositorygui#lg_div_1948579_pref_1943526
if EXPAND_LINK.is_match(&href) {
return download(path, ilias, &URL::from_href(&href)?).await;
}
}
if ilias.opt.save_ilias_pages {
if let Some(s) = content.1.as_ref() {
let path = path.join("folder.html");
@@ -20,6 +35,7 @@ pub async fn download(path: &Path, ilias: Arc<ILIAS>, url: &URL) -> Result<()> {
.context("failed to write folder page html")?;
}
}
let mut names = HashSet::new();
for item in content.0 {
let item = item?;

View File

@@ -14,7 +14,7 @@ impl IliasIgnore {
let mut prefix = Vec::new();
// example scenario:
// path = /KIT/ILIAS/SS 23/Next Generation Internet
// iliasignore in ILIAS/.iliasignore: prefix = SS 23/Next Generation Internet
// iliasignore in ILIAS/.iliasignore: prefix = SS 23/Next Generation Internet/
// iliasignore in Next Generation Internet/.iliasignore: prefix = ""
loop {
let (ignore, error) = Gitignore::new(path.join(".iliasignore"));