Intro
This time, I will try reading a word processing file (an MS Word file) with the Open XML SDK.
I will use an MS Word template to check the result.
Environments
- .NET ver.9.0.100
- DocumentFormat.OpenXml ver.3.1.1
- NLog.Web.AspNetCore ver.5.3.14
- Microsoft.AspNetCore.SpaServices.Extensions ver.9.0.0
Reading
I can get a stream from the IFormFile that is sent from the client side and read it as a "WordprocessingDocument".
DocFileReader.cs
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
namespace OfficeFileAccessor.OfficeFiles.Readers;
/// <summary>
/// Reads an uploaded word processing file and logs every direct child
/// element of the document body (its type and inner XML).
/// </summary>
public class DocFileReader: IOfficeFileReader
{
    private readonly NLog.Logger logger;

    /// <summary>
    /// Creates the reader and obtains the NLog logger for this class.
    /// </summary>
    public DocFileReader()
    {
        this.logger = NLog.LogManager.GetCurrentClassLogger();
    }

    /// <summary>
    /// Opens the uploaded file read-only as a <see cref="WordprocessingDocument"/>
    /// and logs the type and inner XML of each element directly under the body.
    /// </summary>
    /// <param name="file">The uploaded file sent from the client side.</param>
    public void Read(IFormFile file)
    {
        // Hold the upload stream in its own using declaration so it is always
        // released; disposing the document is not relied upon to close a
        // stream that was handed to it. Using declarations are disposed in
        // reverse order, so the document is closed before the stream.
        using var stream = file.OpenReadStream();
        // Open the stream as a read-only document (isEditable: false)
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(stream, false);
        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
        if (body == null)
        {
            // No main part, no document root, or no body: nothing to read
            logger.Warn("Failed reading the document");
            return;
        }
        // Get all elements as XML from body
        foreach (OpenXmlElement elm in body.Elements())
        {
            logger.Info($"Type: {elm.GetType()} XML: {elm.InnerXml}");
        }
    }
}
Result
Type: DocumentFormat.OpenXml.Wordprocessing.Paragraph XML: <w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:pStyle w:val="a6" /><w:jc w:val="left" /></w:pPr><w:r xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:rPr><w:noProof /></w:rPr><mc:AlternateContent xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"><mc:Choice Requires="wps"><w:drawing><wp:anchor distT="0" distB="0" distL="114300" distR="114300" simplePos="0" relativeHeight="251659264" behindDoc="0" locked="0" layoutInCell="1" allowOverlap="1" wp14:editId="2621265D" wp14:anchorId="3EB8B806" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"><wp:simplePos x="0" y="0" /><wp:positionH relativeFrom="column"><wp:posOffset>485140</wp:posOffset></wp:positionH><wp:positionV relativeFrom="paragraph"><wp:posOffset>329565</wp:posOffset></wp:positionV><wp:extent cx="1495425" cy="604837" /><wp:effectExtent l="0" t="0" r="28575" b="24130" /><wp:wrapNone />
...
I can also get the inner text and style information for each element type.
Table
DocFileReader.cs
...
// Abridged version of Read: the elided part ("...") is the document-opening
// code shown earlier in the article, which is presumably where "body" is
// obtained. This excerpt only shows the loop that picks tables out of the body.
public void Read(IFormFile file)
{
...
// Walk the elements returned by body.Elements() and handle only the tables
foreach(OpenXmlElement elm in body.Elements())
{
// The type pattern both tests for a Table and binds it to "table"
if(elm is Table table)
{
logger.Info("Table found:");
// Log the table's width, borders, rows and cells
ReadTableProps(table);
}
}
}
/// <summary>
/// Logs the layout information of a table: its width and left/top borders,
/// then for every row its height, and for every cell its text, width,
/// right/bottom borders and shading colors.
/// </summary>
/// <param name="table">The table element to inspect.</param>
private void ReadTableProps(Table table)
{
    // Table-level settings (skipped entirely when the table has no properties element)
    if (table.GetFirstChild<TableProperties>() is TableProperties tableProps)
    {
        // The width line is logged even when no width element exists (empty value)
        var tableWidth = tableProps.GetFirstChild<TableWidth>();
        logger.Info($"Table Width: {tableWidth?.Width}");

        if (tableProps.GetFirstChild<TableBorders>() is TableBorders borders)
        {
            logger.Info($"Table Border Left Val: {borders.LeftBorder?.Val} Color: {borders.LeftBorder?.Color} Size: {borders.LeftBorder?.Size}");
            logger.Info($"Table Border Top Val: {borders.TopBorder?.Val} Color: {borders.TopBorder?.Color} Size: {borders.TopBorder?.Size}");
        }
    }

    // Rows, and the cells inside each row
    foreach (TableRow tableRow in table.Elements<TableRow>())
    {
        if (tableRow.GetFirstChild<TableRowProperties>() is TableRowProperties rowProps)
        {
            var rowHeight = rowProps.GetFirstChild<TableRowHeight>();
            logger.Info($"Row Height: {rowHeight?.Val}");
        }

        foreach (TableCell tableCell in tableRow.Elements<TableCell>())
        {
            // The cell text is always logged, with or without cell properties
            string cellText = tableCell.InnerText;
            logger.Info($"CELL Text: {cellText}");

            // Nothing more to report for a cell without a properties element
            if (tableCell.GetFirstChild<TableCellProperties>() is not TableCellProperties cellProps)
            {
                continue;
            }

            var cellWidth = cellProps.GetFirstChild<TableCellWidth>();
            logger.Info($"Cell Width: {cellWidth?.Width}");

            if (cellProps.GetFirstChild<TableCellBorders>() is TableCellBorders borders)
            {
                logger.Info($"Cell Border Right Val: {borders.RightBorder?.Val} Color: {borders.RightBorder?.Color} Size: {borders.RightBorder?.Size}");
                logger.Info($"Cell Border Bottom Val: {borders.BottomBorder?.Val} Color: {borders.BottomBorder?.Color} Size: {borders.BottomBorder?.Size}");
            }

            // Background (fill) and pattern colors
            if (cellProps.GetFirstChild<Shading>() is Shading shading)
            {
                logger.Info($"Cell BackgroundColor: {shading.Fill?.Value} Color:{shading.Color}");
            }
        }

        // Separator line after each row
        logger.Info("-----------");
    }

    // Blank entry after the whole table
    logger.Info("\n");
}
}
Top comments (0)