mirror of
https://github.com/FliegendeWurst/KIT-ILIAS-downloader.git
synced 2024-08-28 04:04:18 +00:00
Fix content tree extraction
This commit is contained in:
parent
3af01403bd
commit
97ee02756d
152
src/main.rs
152
src/main.rs
@ -72,7 +72,7 @@ enum Object {
|
|||||||
url: URL
|
url: URL
|
||||||
},
|
},
|
||||||
Video {
|
Video {
|
||||||
url: String,
|
url: URL,
|
||||||
},
|
},
|
||||||
Generic {
|
Generic {
|
||||||
name: String,
|
name: String,
|
||||||
@ -93,7 +93,7 @@ impl Object {
|
|||||||
Wiki { name, .. } => &name,
|
Wiki { name, .. } => &name,
|
||||||
ExerciseHandler { name, .. } => &name,
|
ExerciseHandler { name, .. } => &name,
|
||||||
PluginDispatch { name, .. } => &name,
|
PluginDispatch { name, .. } => &name,
|
||||||
Video { url } => &url,
|
Video { url } => &url.url,
|
||||||
Generic { name, .. } => &name,
|
Generic { name, .. } => &name,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -108,7 +108,7 @@ impl Object {
|
|||||||
Wiki { url, .. } => &url,
|
Wiki { url, .. } => &url,
|
||||||
ExerciseHandler { url, .. } => &url,
|
ExerciseHandler { url, .. } => &url,
|
||||||
PluginDispatch { url, .. } => &url,
|
PluginDispatch { url, .. } => &url,
|
||||||
Video { .. } => unreachable!(),
|
Video { url } => &url,
|
||||||
Generic { url, .. } => &url,
|
Generic { url, .. } => &url,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -129,8 +129,8 @@ impl Object {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn from_link(item: ElementRef, link: ElementRef) -> Self {
|
fn from_link(item: ElementRef, link: ElementRef) -> Self {
|
||||||
let mut name = link.text().collect::<String>().replace('/', "-");
|
let mut name = link.text().collect::<String>().replace('/', "-").trim().to_owned();
|
||||||
let url = URL::from_href(link.value().attr("href").unwrap());
|
let mut url = URL::from_href(link.value().attr("href").unwrap());
|
||||||
|
|
||||||
if url.thr_pk.is_some() {
|
if url.thr_pk.is_some() {
|
||||||
return Thread {
|
return Thread {
|
||||||
@ -139,6 +139,61 @@ impl Object {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if url.url.starts_with("https://ilias.studium.kit.edu/goto.php") {
|
if url.url.starts_with("https://ilias.studium.kit.edu/goto.php") {
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("wiki_")).unwrap_or(false) {
|
||||||
|
return Wiki {
|
||||||
|
name,
|
||||||
|
url // TODO: insert ref_id here
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("root_")).unwrap_or(false) {
|
||||||
|
// magazine link
|
||||||
|
return Generic {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("crs_")).unwrap_or(false) {
|
||||||
|
let ref_id = url.target.as_ref().unwrap().split('_').nth(1).unwrap();
|
||||||
|
url.ref_id = ref_id.to_owned();
|
||||||
|
return Course {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("frm_")).unwrap_or(false) {
|
||||||
|
// TODO: extract post link? (this codepath should only be hit when parsing the content tree)
|
||||||
|
let ref_id = url.target.as_ref().unwrap().split('_').nth(1).unwrap();
|
||||||
|
url.ref_id = ref_id.to_owned();
|
||||||
|
return Forum {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("lm_")).unwrap_or(false) {
|
||||||
|
// fancy interactive task
|
||||||
|
return Generic {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("fold_")).unwrap_or(false) {
|
||||||
|
let ref_id = url.target.as_ref().unwrap().split('_').nth(1).unwrap();
|
||||||
|
url.ref_id = ref_id.to_owned();
|
||||||
|
return Folder {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if url.target.as_ref().map(|x| x.starts_with("file_")).unwrap_or(false) {
|
||||||
|
let target = url.target.as_ref().unwrap();
|
||||||
|
if !target.ends_with("download") {
|
||||||
|
// download page containing metadata
|
||||||
|
// TODO: perhaps process that? not really needed
|
||||||
|
return Generic {
|
||||||
|
name,
|
||||||
|
url
|
||||||
|
};
|
||||||
|
} else {
|
||||||
let item_prop = Selector::parse("span.il_ItemProperty").unwrap();
|
let item_prop = Selector::parse("span.il_ItemProperty").unwrap();
|
||||||
let mut item_props = item.select(&item_prop);
|
let mut item_props = item.select(&item_prop);
|
||||||
let ext = item_props.next().unwrap();
|
let ext = item_props.next().unwrap();
|
||||||
@ -150,20 +205,23 @@ impl Object {
|
|||||||
}
|
}
|
||||||
return File { name: format!("{}.{}", name, ext.text().collect::<String>().trim()), url };
|
return File { name: format!("{}.{}", name, ext.text().collect::<String>().trim()), url };
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
return Generic { name, url };
|
||||||
|
}
|
||||||
|
|
||||||
if url.cmd.as_ref().map(|x| &**x) == Some("showThreads") {
|
if url.cmd.as_ref().map(|x| &**x) == Some("showThreads") {
|
||||||
return Forum { name, url };
|
return Forum { name, url };
|
||||||
}
|
}
|
||||||
|
|
||||||
match &*url.baseClass {
|
match &*url.baseClass.to_ascii_lowercase() {
|
||||||
"ilExerciseHandlerGUI" => ExerciseHandler { name, url },
|
"ilexercisehandlergui" => ExerciseHandler { name, url },
|
||||||
"ililWikiHandlerGUI" => Wiki { name, url },
|
"ililwikihandlergui" => Wiki { name, url },
|
||||||
"ilrepositorygui" => match url.cmd.as_deref() {
|
"ilrepositorygui" => match url.cmd.as_deref() {
|
||||||
Some("view") => Folder { name, url },
|
Some("view") => Folder { name, url },
|
||||||
Some(_) => Generic { name, url },
|
Some(_) => Generic { name, url },
|
||||||
None => Course { name, url },
|
None => Course { name, url },
|
||||||
},
|
},
|
||||||
"ilObjPluginDispatchGUI" => PluginDispatch { name, url },
|
"ilobjplugindispatchgui" => PluginDispatch { name, url },
|
||||||
_ => Generic { name, url }
|
_ => Generic { name, url }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -181,10 +239,26 @@ struct URL {
|
|||||||
thr_pk: Option<String>,
|
thr_pk: Option<String>,
|
||||||
pos_pk: Option<String>,
|
pos_pk: Option<String>,
|
||||||
ref_id: String,
|
ref_id: String,
|
||||||
|
target: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(non_snake_case)]
|
#[allow(non_snake_case)]
|
||||||
impl URL {
|
impl URL {
|
||||||
|
fn raw(url: String) -> Self {
|
||||||
|
URL {
|
||||||
|
url,
|
||||||
|
baseClass: String::new(),
|
||||||
|
cmdClass: None,
|
||||||
|
cmdNode: None,
|
||||||
|
cmd: None,
|
||||||
|
forwardCmd: None,
|
||||||
|
thr_pk: None,
|
||||||
|
pos_pk: None,
|
||||||
|
ref_id: String::new(),
|
||||||
|
target: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn from_href(href: &str) -> Self {
|
fn from_href(href: &str) -> Self {
|
||||||
let url = Url::parse(&format!("http://domain/{}", href)).unwrap();
|
let url = Url::parse(&format!("http://domain/{}", href)).unwrap();
|
||||||
let mut baseClass = String::new();
|
let mut baseClass = String::new();
|
||||||
@ -195,6 +269,7 @@ impl URL {
|
|||||||
let mut thr_pk = None;
|
let mut thr_pk = None;
|
||||||
let mut pos_pk = None;
|
let mut pos_pk = None;
|
||||||
let mut ref_id = String::new();
|
let mut ref_id = String::new();
|
||||||
|
let mut target = None;
|
||||||
for (k, v) in url.query_pairs() {
|
for (k, v) in url.query_pairs() {
|
||||||
match &*k {
|
match &*k {
|
||||||
"baseClass" => baseClass = v.into_owned(),
|
"baseClass" => baseClass = v.into_owned(),
|
||||||
@ -205,6 +280,7 @@ impl URL {
|
|||||||
"thr_pk" => thr_pk = Some(v.into_owned()),
|
"thr_pk" => thr_pk = Some(v.into_owned()),
|
||||||
"pos_pk" => pos_pk = Some(v.into_owned()),
|
"pos_pk" => pos_pk = Some(v.into_owned()),
|
||||||
"ref_id" => ref_id = v.into_owned(),
|
"ref_id" => ref_id = v.into_owned(),
|
||||||
|
"target" => target = Some(v.into_owned()),
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -217,7 +293,8 @@ impl URL {
|
|||||||
forwardCmd,
|
forwardCmd,
|
||||||
thr_pk,
|
thr_pk,
|
||||||
pos_pk,
|
pos_pk,
|
||||||
ref_id
|
ref_id,
|
||||||
|
target,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -301,14 +378,19 @@ impl ILIAS {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn get_html(&self, url: &str) -> Result<Html> {
|
async fn get_html(&self, url: &str) -> Result<Html> {
|
||||||
let text = self.client.get(url).send().await?.text().await?;
|
let text = if url.starts_with("http") || url.starts_with("ilias.studium.kit.edu") {
|
||||||
|
self.client.get(url).send().await?.text().await?
|
||||||
|
} else {
|
||||||
|
let url = format!("{}{}", ILIAS_URL, url);
|
||||||
|
self.client.get(&url).send().await?.text().await?
|
||||||
|
};
|
||||||
Ok(Html::parse_document(&text))
|
Ok(Html::parse_document(&text))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_html_fragment(&self, url: &str) -> Result<Html> {
|
async fn get_html_fragment(&self, url: &str) -> Result<Html> {
|
||||||
let text = self.client.get(url).send().await?.text().await?;
|
let text = self.client.get(url).send().await?.text().await?;
|
||||||
let html = Html::parse_fragment(&text);
|
let html = Html::parse_fragment(&text);
|
||||||
// TODO: have this above too
|
// TODO: have this in get_html too
|
||||||
if html.select(&alert_danger).next().is_some() {
|
if html.select(&alert_danger).next().is_some() {
|
||||||
//println!("{}", text);
|
//println!("{}", text);
|
||||||
Err("ILIAS error".into())
|
Err("ILIAS error".into())
|
||||||
@ -318,16 +400,20 @@ impl ILIAS {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn get_course_content(&self, url: &URL) -> Result<Vec<Object>> {
|
async fn get_course_content(&self, url: &URL) -> Result<Vec<Object>> {
|
||||||
let html = self.get_html(&format!("{}{}", ILIAS_URL, url.url)).await?;
|
let html = self.get_html(&url.url).await?;
|
||||||
Ok(ILIAS::get_items(&html)?)
|
Ok(ILIAS::get_items(&html)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_course_content_tree(&self, ref_id: &str) -> Result<Vec<Object>> {
|
async fn get_course_content_tree(&self, ref_id: &str, cmd_node: &str) -> Result<Vec<Object>> {
|
||||||
|
// TODO: this magically does not return sub-folders
|
||||||
|
// opening the same url in browser does show sub-folders..
|
||||||
let url = format!(
|
let url = format!(
|
||||||
"{}ilias.php?ref_id={}&cmdClass=ilobjrootfoldergui&cmd=showRepTree&cmdNode=uf:o4&baseClass=ilRepositoryGUI&cmdMode=asynch&exp_cmd=getNodeAsync&node_id=exp_node_rep_exp_{}&exp_cont=il_expl2_jstree_cont_rep_exp&searchterm=",
|
"{}ilias.php?ref_id={}&cmdClass=ilobjcoursegui&cmd=showRepTree&cmdNode={}&baseClass=ilRepositoryGUI&cmdMode=asynch&exp_cmd=getNodeAsync&node_id=exp_node_rep_exp_{}&exp_cont=il_expl2_jstree_cont_rep_exp&searchterm=",
|
||||||
ILIAS_URL, ref_id, ref_id
|
ILIAS_URL, ref_id, cmd_node, ref_id
|
||||||
);
|
);
|
||||||
|
if self.opt.verbose > 0 {
|
||||||
println!("Loading {:?}..", url);
|
println!("Loading {:?}..", url);
|
||||||
|
}
|
||||||
let html = self.get_html_fragment(&url).await?;
|
let html = self.get_html_fragment(&url).await?;
|
||||||
let mut items = Vec::new();
|
let mut items = Vec::new();
|
||||||
for link in html.select(&a) {
|
for link in html.select(&a) {
|
||||||
@ -345,7 +431,11 @@ impl ILIAS {
|
|||||||
if self.opt.verbose > 0 {
|
if self.opt.verbose > 0 {
|
||||||
println!("Downloading {}", url);
|
println!("Downloading {}", url);
|
||||||
}
|
}
|
||||||
|
if url.starts_with("http") || url.starts_with("ilias.studium.kit.edu") {
|
||||||
Ok(self.client.get(url).send().await?)
|
Ok(self.client.get(url).send().await?)
|
||||||
|
} else {
|
||||||
|
Ok(self.client.get(&format!("{}{}", ILIAS_URL, url)).send().await?)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -367,9 +457,8 @@ async fn main() {
|
|||||||
std::process::exit(77);
|
std::process::exit(77);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
// this is useless, further requests still error
|
// need this to get the content tree
|
||||||
//println!("Loading root..");
|
let _ = ilias.client.get("https://ilias.studium.kit.edu/ilias.php?baseClass=ilRepositoryGUI&cmd=frameset&set_mode=tree&ref_id=1").send().await;
|
||||||
//ilias.client.get("https://ilias.studium.kit.edu/ilias.php?ref_id=1&cmd=showRepTree&cmdClass=ilobjrootfoldergui&cmdNode=uf:o4&baseClass=ilRepositoryGUI&cmdMode=asynch&exp_cmd=getNodeAsync&node_id=&exp_cont=il_expl2_jstree_cont_rep_exp&searchterm=").send().await.unwrap();
|
|
||||||
let ilias = Arc::new(ilias);
|
let ilias = Arc::new(ilias);
|
||||||
let desktop = ilias.personal_desktop().await.unwrap();
|
let desktop = ilias.personal_desktop().await.unwrap();
|
||||||
for item in desktop.items {
|
for item in desktop.items {
|
||||||
@ -385,6 +474,8 @@ async fn main() {
|
|||||||
while *TASKS_QUEUED.lock() > 0 {
|
while *TASKS_QUEUED.lock() > 0 {
|
||||||
tokio::time::delay_for(Duration::from_millis(500)).await;
|
tokio::time::delay_for(Duration::from_millis(500)).await;
|
||||||
}
|
}
|
||||||
|
// restore fast page loading times
|
||||||
|
let _ = ilias.client.get("https://ilias.studium.kit.edu/ilias.php?baseClass=ilRepositoryGUI&cmd=frameset&set_mode=flat&ref_id=1").send().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
lazy_static!{
|
lazy_static!{
|
||||||
@ -411,6 +502,7 @@ fn process_gracefully(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std
|
|||||||
#[allow(non_upper_case_globals)]
|
#[allow(non_upper_case_globals)]
|
||||||
mod selectors {
|
mod selectors {
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
|
use regex::Regex;
|
||||||
use scraper::Selector;
|
use scraper::Selector;
|
||||||
// construct CSS selectors once
|
// construct CSS selectors once
|
||||||
lazy_static!{
|
lazy_static!{
|
||||||
@ -427,6 +519,9 @@ mod selectors {
|
|||||||
pub static ref span_small: Selector = Selector::parse("span.small").unwrap();
|
pub static ref span_small: Selector = Selector::parse("span.small").unwrap();
|
||||||
pub static ref forum_pages: Selector = Selector::parse("div.ilTableNav > table > tbody > tr > td > a").unwrap();
|
pub static ref forum_pages: Selector = Selector::parse("div.ilTableNav > table > tbody > tr > td > a").unwrap();
|
||||||
pub static ref alert_danger: Selector = Selector::parse("div.alert-danger").unwrap();
|
pub static ref alert_danger: Selector = Selector::parse("div.alert-danger").unwrap();
|
||||||
|
pub static ref tree_highlighted: Selector = Selector::parse("span.ilHighlighted").unwrap();
|
||||||
|
|
||||||
|
pub static ref cmd_node_regex: Regex = Regex::new(r#"cmdNode=uf:\w\w"#).unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
use crate::selectors::*;
|
use crate::selectors::*;
|
||||||
@ -435,7 +530,7 @@ use crate::selectors::*;
|
|||||||
//async fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) {
|
//async fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) {
|
||||||
fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::Future<Output = Result<()>> + Send { async move {
|
fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::Future<Output = Result<()>> + Send { async move {
|
||||||
if ilias.opt.verbose > 0 {
|
if ilias.opt.verbose > 0 {
|
||||||
println!("Syncing {} {}..", obj.kind(), path.strip_prefix(&ilias.opt.output).unwrap().to_string_lossy());
|
println!("Syncing {} {}.. {}", obj.kind(), path.strip_prefix(&ilias.opt.output).unwrap().to_string_lossy(), obj.url().url);
|
||||||
}
|
}
|
||||||
match &obj {
|
match &obj {
|
||||||
Course { url, name } => {
|
Course { url, name } => {
|
||||||
@ -444,14 +539,19 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
|
|||||||
Err(e)?;
|
Err(e)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let content_tree = ilias.get_course_content_tree(&url.ref_id).await;
|
let html = ilias.download(&url.url).await?.text().await?;
|
||||||
|
let cmd_node = cmd_node_regex.find(&html).ok_or::<Error>("can't find cmdNode".into())?.as_str()[8..].to_owned();
|
||||||
|
let content_tree = ilias.get_course_content_tree(&url.ref_id, &cmd_node).await;
|
||||||
let content = match content_tree {
|
let content = match content_tree {
|
||||||
Ok(tree) => tree,
|
Ok(tree) => tree,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// some folders are hidden and can only be found via the RSS feed / recent activity / content tree sidebar
|
// some folders are hidden on the course page and can only be found via the RSS feed / recent activity / content tree sidebar
|
||||||
// TODO: this is never the case for folders?
|
// TODO: this is probably never the case for folders?
|
||||||
|
if html.contains(r#"input[name="cmd[join]""#) {
|
||||||
|
return Ok(()); // ignore groups we are not in
|
||||||
|
}
|
||||||
println!("Warning: {:?} falling back to incomplete course content extractor! {}", name, e.display_chain());
|
println!("Warning: {:?} falling back to incomplete course content extractor! {}", name, e.display_chain());
|
||||||
ilias.get_course_content(&url).await?
|
ilias.get_course_content(&url).await? // TODO: perhaps don't download almost the same content 3x
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for item in content {
|
for item in content {
|
||||||
@ -530,7 +630,7 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
|
|||||||
println!("Found video: {}", title);
|
println!("Found video: {}", title);
|
||||||
}
|
}
|
||||||
let video = Video {
|
let video = Video {
|
||||||
url: link.value().attr("href").unwrap().to_owned()
|
url: URL::raw(link.value().attr("href").unwrap().to_owned())
|
||||||
};
|
};
|
||||||
let ilias = Arc::clone(&ilias);
|
let ilias = Arc::clone(&ilias);
|
||||||
task::spawn(async {
|
task::spawn(async {
|
||||||
@ -553,7 +653,7 @@ fn process(ilias: Arc<ILIAS>, path: PathBuf, obj: Object) -> impl std::future::F
|
|||||||
}
|
}
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let url = format!("{}{}", ILIAS_URL, url);
|
let url = format!("{}{}", ILIAS_URL, url.url);
|
||||||
let data = ilias.download(&url);
|
let data = ilias.download(&url);
|
||||||
let html = data.await?.text().await?;
|
let html = data.await?.text().await?;
|
||||||
if ilias.opt.verbose > 1 {
|
if ilias.opt.verbose > 1 {
|
||||||
|
Loading…
Reference in New Issue
Block a user