feat: Refactor powerpoint to be under io and create initial docx module (3ccc7979) · Commits · Research Enablement / ACORN

Cargo.lock

+37 −0

Original line number	Diff line number	Diff line
		@@ -74,6 +74,7 @@ dependencies = [
		"data-encoding",
		"derive_more",
		"directories",
		"docx-rs",
		"dotenvy",
		"duckdb",
		"exitcode",
		@@ -1954,6 +1955,20 @@ dependencies = [
		"litrs",
		]

		[[package]]
		name = "docx-rs"
		version = "0.4.20"
		source = "registry+https://github.com/rust-lang/crates.io-index"
		checksum = "ed73cbf5e1c37baa23f4132569ac1187829f03922c206bd68fe109e3001a343d"
		dependencies = [
		"base64 0.22.1",
		"quick-xml 0.36.2",
		"serde",
		"serde_json",
		"thiserror 2.0.18",
		"zip 0.6.6",
		]

		[[package]]
		name = "dotenvy"
		version = "0.15.7"
		@@ -5343,6 +5358,16 @@ dependencies = [
		"memchr",
		]

		[[package]]
		name = "quick-xml"
		version = "0.36.2"
		source = "registry+https://github.com/rust-lang/crates.io-index"
		checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe"
		dependencies = [
		"encoding_rs",
		"memchr",
		]

		[[package]]
		name = "quick-xml"
		version = "0.39.1"
		@@ -9223,6 +9248,18 @@ dependencies = [
		"time 0.1.45",
		]

		[[package]]
		name = "zip"
		version = "0.6.6"
		source = "registry+https://github.com/rust-lang/crates.io-index"
		checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
		dependencies = [
		"byteorder",
		"crc32fast",
		"crossbeam-utils",
		"flate2",
		]

		[[package]]
		name = "zip"
		version = "6.0.0"

Cargo.toml

+1 −0

Original line number	Diff line number	Diff line
		@@ -37,6 +37,7 @@ derive_more = { version = "2.1.1", features = [
		"from_str",
		], default-features = false }
		dotenvy = { version = "0.15.0", default-features = false }
		docx-rs = { version = "0.4.17", default-features = false }
		duckdb = { version = "1.10502.0", default-features = false }
		fancy-regex = { version = "0.17.0", default-features = false }
		futures = { version = "0.3.32", features = ["alloc"], default-features = false }

acorn-cli/src/commands/export/powerpoint.rs

+2 −2

Original line number	Diff line number	Diff line
		@@ -3,9 +3,9 @@
		//! This module generates PPTX output by copying slide templates, updating
		//! OOXML relationships, and interpolating Research Activity data.
		use crate::cli::CommandOptions;
		use acorn::io::powerpoint::ooxml::{Relationship, Relationships};
		use acorn::io::powerpoint::{interpolate_values, read_xml_rel};
		use acorn::io::{archive, extract_zip, read_file, write_file, ApiResult, InputOutput};
		use acorn::powerpoint::ooxml::{Relationship, Relationships};
		use acorn::powerpoint::{interpolate_values, read_xml_rel};
		use acorn::prelude::{copy, create_dir_all, exit, Path, PathBuf};
		use acorn::schema::research_activity::ResearchActivity;
		use acorn::util::{Label, StringConversion};

acorn-lib/Cargo.toml

+1 −0

Original line number	Diff line number	Diff line
		@@ -40,6 +40,7 @@ data-encoding = { workspace = true }
		derive_more = { workspace = true }
		directories = { version = "6.0.0", default-features = false }
		dotenvy = { workspace = true }
		docx-rs = { workspace = true }
		duckdb = { workspace = true, features = ["bundled"], default-features = false, optional = true }
		fancy-regex = { workspace = true }
		flate2 = { version = "1.1.5", default-features = false }

acorn-lib/src/io/docx/mod.rs

0 → 100644

+159 −0

Original line number	Diff line number	Diff line
		//! DOCX input utilities.
		use crate::io::ApiResult;
		use crate::prelude::{read, PathBuf};
		use color_eyre::eyre::eyre;
		use docx_rs::{
		read_docx, Delete, DeleteChild, DocumentChild, Hyperlink, Insert, InsertChild, Paragraph, ParagraphChild, Run, RunChild, Table, TableCell,
		TableCellContent, TableChild, TableRow, TableRowChild,
		};

		/// Render a table as text: each row is extracted on its own and the
		/// results are joined with a newline.
		fn extract_from_table(table: &Table) -> String {
		    let mut row_texts: Vec<String> = Vec::with_capacity(table.rows.len());
		    // `TableChild` has a single variant, so the pattern is irrefutable.
		    for TableChild::TableRow(row) in &table.rows {
		        row_texts.push(extract_from_table_row(row));
		    }
		    row_texts.join("\n")
		}
		/// Render a table row as text: each cell is extracted on its own and the
		/// results are joined with a newline.
		fn extract_from_table_row(row: &TableRow) -> String {
		    let mut cell_texts: Vec<String> = Vec::with_capacity(row.cells.len());
		    // `TableRowChild` has a single variant, so the pattern is irrefutable.
		    for TableRowChild::TableCell(cell) in &row.cells {
		        cell_texts.push(extract_from_table_cell(cell));
		    }
		    cell_texts.join("\n")
		}
		/// Render a table cell as text.
		///
		/// Paragraphs and nested tables are extracted recursively; structured data
		/// tags and tables of contents contribute an empty entry. Entries are joined
		/// with a newline.
		fn extract_from_table_cell(cell: &TableCell) -> String {
		    let mut pieces: Vec<String> = Vec::with_capacity(cell.children.len());
		    for content in &cell.children {
		        let piece = match content {
		            TableCellContent::Paragraph(paragraph) => extract_from_paragraph(paragraph),
		            TableCellContent::Table(table) => extract_from_table(table),
		            // Still pushed as an empty entry so the join output is unchanged.
		            TableCellContent::StructuredDataTag(_) | TableCellContent::TableOfContents(_) => String::new(),
		        };
		        pieces.push(piece);
		    }
		    pieces.join("\n")
		}
		/// Concatenate the text of every child of a paragraph, with no separator.
		///
		/// Runs, hyperlinks, insertions and deletions are delegated to their own
		/// extractors. Page-number and page-count fields become a single space.
		/// Bookmarks, comment markers and structured data tags add nothing.
		fn extract_from_paragraph(paragraph: &Paragraph) -> String {
		    let mut buffer = String::new();
		    for node in &paragraph.children {
		        match node {
		            ParagraphChild::Run(run) => buffer.push_str(&extract_from_run(run)),
		            ParagraphChild::Hyperlink(link) => buffer.push_str(&extract_from_hyperlink(link)),
		            ParagraphChild::Insert(inserted) => buffer.push_str(&extract_from_insert(inserted)),
		            ParagraphChild::Delete(deleted) => buffer.push_str(&extract_from_delete(deleted)),
		            ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => buffer.push(' '),
		            ParagraphChild::BookmarkStart(_)
		            | ParagraphChild::BookmarkEnd(_)
		            | ParagraphChild::CommentStart(_)
		            | ParagraphChild::CommentEnd(_)
		            | ParagraphChild::StructuredDataTag(_) => {}
		        }
		    }
		    buffer
		}
		/// Concatenate the text of every child of a hyperlink, with no separator.
		///
		/// Handles the same child kinds as a paragraph, except that a hyperlink
		/// nested inside this one contributes no text (it is not descended into).
		fn extract_from_hyperlink(hyperlink: &Hyperlink) -> String {
		    let mut buffer = String::new();
		    for node in &hyperlink.children {
		        match node {
		            ParagraphChild::Run(run) => buffer.push_str(&extract_from_run(run)),
		            ParagraphChild::Insert(inserted) => buffer.push_str(&extract_from_insert(inserted)),
		            ParagraphChild::Delete(deleted) => buffer.push_str(&extract_from_delete(deleted)),
		            ParagraphChild::PageNum(_) | ParagraphChild::NumPages(_) => buffer.push(' '),
		            // Nested hyperlinks and non-textual markers produce no output.
		            ParagraphChild::Hyperlink(_)
		            | ParagraphChild::BookmarkStart(_)
		            | ParagraphChild::BookmarkEnd(_)
		            | ParagraphChild::CommentStart(_)
		            | ParagraphChild::CommentEnd(_)
		            | ParagraphChild::StructuredDataTag(_) => {}
		        }
		    }
		    buffer
		}
		/// Concatenate the text of an insertion's children, with no separator.
		///
		/// Runs and nested deletions are delegated to their extractors; comment
		/// markers add nothing.
		fn extract_from_insert(insert: &Insert) -> String {
		    let mut buffer = String::new();
		    for node in &insert.children {
		        match node {
		            InsertChild::Run(run) => buffer.push_str(&extract_from_run(run)),
		            InsertChild::Delete(deleted) => buffer.push_str(&extract_from_delete(deleted)),
		            InsertChild::CommentStart(_) | InsertChild::CommentEnd(_) => {}
		        }
		    }
		    buffer
		}
		/// Concatenate the text of a deletion's children, with no separator.
		///
		/// Runs are delegated to `extract_from_run`; comment markers add nothing.
		fn extract_from_delete(delete: &Delete) -> String {
		    let mut buffer = String::new();
		    for node in &delete.children {
		        match node {
		            DeleteChild::Run(run) => buffer.push_str(&extract_from_run(run)),
		            DeleteChild::CommentStart(_) | DeleteChild::CommentEnd(_) => {}
		        }
		    }
		    buffer
		}
		/// Concatenate the textual content of a run, with no separator.
		///
		/// Plain text and reader-produced instruction strings are copied verbatim,
		/// a tab becomes `\t`, and a break becomes `\n`. Every other child kind
		/// (symbols, deleted text, drawings, shapes, comment markers, field
		/// characters, footnote references, shading, ...) adds nothing.
		fn extract_from_run(run: &Run) -> String {
		    let mut buffer = String::new();
		    for node in &run.children {
		        match node {
		            RunChild::Text(text) => buffer.push_str(&text.text),
		            RunChild::InstrTextString(instruction) => buffer.push_str(instruction),
		            RunChild::Tab(_) => buffer.push('\t'),
		            RunChild::Break(_) => buffer.push('\n'),
		            RunChild::Sym(_)
		            | RunChild::DeleteText(_)
		            | RunChild::PTab(_)
		            | RunChild::Drawing(_)
		            | RunChild::Shape(_)
		            | RunChild::CommentStart(_)
		            | RunChild::CommentEnd(_)
		            | RunChild::FieldChar(_)
		            | RunChild::InstrText(_)
		            | RunChild::DeleteInstrText(_)
		            | RunChild::FootnoteReference(_)
		            | RunChild::Shading(_) => {}
		        }
		    }
		    buffer
		}
		/// Extract text from the DOCX file at `path`.
		///
		/// Only top-level paragraphs and tables of the document body are considered;
		/// any other top-level child is skipped. Each block's text is trimmed, blocks
		/// that end up empty are dropped, and the remaining blocks are joined with a
		/// newline.
		///
		/// # Errors
		///
		/// Returns an error when the file cannot be read or when its contents cannot
		/// be parsed as a DOCX document.
		pub fn extract_text_from_path<P>(path: P) -> ApiResult<String>
		where
		    P: Into<PathBuf>,
		{
		    let path = path.into();
		    let bytes = read(path.clone()).map_err(|why| eyre!("Failed to read DOCX file {} - {why}", path.display()))?;
		    let docx = read_docx(&bytes).map_err(|why| eyre!("Failed to parse DOCX file {} - {why}", path.display()))?;
		    let mut blocks: Vec<String> = Vec::new();
		    for child in &docx.document.children {
		        let raw = match child {
		            DocumentChild::Paragraph(paragraph) => extract_from_paragraph(paragraph),
		            DocumentChild::Table(table) => extract_from_table(table),
		            _ => continue,
		        };
		        let trimmed = raw.trim();
		        if !trimmed.is_empty() {
		            blocks.push(trimmed.to_string());
		        }
		    }
		    Ok(blocks.join("\n"))
		}

		#[cfg(test)]
		mod tests;