Intro
This time, I will try reading a spreadsheet file(MS Excel file) by DocumentFormat.OpenXml.
I will use this file to check the result.
[ASP.NET Core] Try reading a word processing file by OpenXML 1
[ASP.NET Core] Try reading a word processing file by OpenXML 2
Getting cells
To load a spreadsheet using OpenXML, I will get a WorkbookPart from the spreadsheet file first.
And then I will get a WorksheetPart from it, finally I will be able to get Worksheets from the WorksheetPart.
XlsFileReader.cs
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using DocumentFormat.OpenXml.Drawing.Spreadsheet;
using Drawing = DocumentFormat.OpenXml.Drawing;
using OfficeFileAccessor.Apps;
using System.Text.RegularExpressions;
namespace OfficeFileAccessor.OfficeFiles.Readers;
public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
{
...
public void Read(IFormFile file)
{
using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
WorkbookPart? bookPart = spreadsheet.WorkbookPart;
if(bookPart == null) {
return;
}
List<string> sheetNames = GetSheetNameList(bookPart);
foreach(var name in sheetNames)
{
Worksheet? targetSheet = GetWorksheet(bookPart, name);
if(targetSheet == null) {
return;
}
WorksheetPart? sheetPart = targetSheet.WorksheetPart;
foreach(Row row in targetSheet.Descendants<Row>())
{
foreach(Cell cell in row.Cast<Cell>())
{
// Get cell values
Worksheets.Cell? cellValue = GetCellValue(bookPart, cell);
}
}
// Load first sheet for testing.
break;
}
}
...
private static List<string> GetSheetNameList(WorkbookPart bookPart) =>
[.. bookPart.Workbook.Descendants<Sheet>().Where(s => string.IsNullOrEmpty(s.Name) == false).Select(s => s.Name?.Value ?? "")];
private static Worksheet? GetWorksheet(WorkbookPart bookPart, string sheetName)
{
foreach(Sheet s in bookPart.Workbook.Descendants<Sheet>())
{
if(s.Name == sheetName && string.IsNullOrEmpty(s.Id) == false)
{
if(bookPart.TryGetPartById(s.Id!, out var part))
{
if (part is WorksheetPart result)
{
return result.Worksheet;
}
}
}
}
return null;
}
...
}
Getting cell values
After getting cell instances, I can get values from them.
Text
Cell.cs
namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
public required string Address { get; init; }
public required CellValueType Type { get; init; }
public required string Value { get; init; }
public string? Formula { get; init; }
...
}
XlsFileReader.cs
...
private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
{
// Get value
string value = cell.InnerText;
// if the data type is SharedString, find the value from Shared String Table
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
SharedStringTablePart? sharedStringTablePart = bookPart.GetPartsOfType<SharedStringTablePart>()
?.FirstOrDefault();
if (sharedStringTablePart != null)
{
OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
.ElementAt(int.Parse(value));
// Concatenate all text except phonetic reading
string result = string.Concat(
sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
.Where(t => CheckIsPhonetic(t) == false)
.Select(t => t.Text)
);
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = Worksheets.CellValueType.Text,
Value = result,
};
}
}
Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = valueType,
Value = value,
};
}
/// <summary>
/// Check if the parent element is "PhoneticRun"
/// </summary>
/// <param name="textElement"></param>
/// <returns></returns>
private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
{
return textElement.Ancestors<PhoneticRun>().Any();
}
...
SharedStringTablePart
When I get the cell value from the SharedStringTablePart, if I get it from InnerText, the phonetic will also be combined.
example
- Cell value: 建築物に関する事項(集約版)(参考様式)
- InnerText: 建築物に関する事項(集約版)(参考様式)ケンチクブツシュウヤクバンサンコウヨウシキ
So I should filter the values.
XlsFileReader.cs
...
private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
{
...
OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
.ElementAt(int.Parse(value));
// Concatenate all text except phonetic reading
string result = string.Concat(
sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
.Where(t => CheckIsPhonetic(t) == false)
.Select(t => t.Text)
);
...
}
...
private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
{
return textElement.Ancestors<PhoneticRun>().Any();
}
...
Double
I can get numeric values as same as getting texts.
But some values may not be correct if they have decimal points.
XlsFileReader.cs
...
Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
if (string.IsNullOrEmpty(value) == false &&
double.TryParse(value, out double nv))
{
valueType = Worksheets.CellValueType.Double;
value = nv.ToString("G");
}
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = valueType,
Value = value,
};
...
Formula
XlsFileReader.cs
...
// Formula
string? formula = cell.CellFormula?.Text;
// Get calculation result
string? calcResult = cell.CellValue?.InnerText;
if(string.IsNullOrEmpty(formula) == false && string.IsNullOrEmpty(calcResult) == false)
{
if (double.TryParse(calcResult, out double n))
{
calcResult = n.ToString("G");
}
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = Worksheets.CellValueType.Formula,
Value = calcResult,
Formula = formula,
};
}
...
Getting cell styles
Border
Get cell borders from WorkbookStylesPart.
Cell.cs
namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
public required string Address { get; init; }
public required CellValueType Type { get; init; }
public required string Value { get; init; }
public string? Formula { get; init; }
public required double Width { get; init; }
public required double Height { get; init; }
public string? BackgroundColor { get; init; }
public required CellBorders Borders { get; init; }
...
}
CellBorders.cs
namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public record CellBorders
{
public BorderType Left { get; init; }
public BorderType Top { get; init; }
public BorderType Right { get; init; }
public BorderType Bottom { get; init; }
...
public static CellBorders GetNoBorders()
{
return new ()
{
Left = BorderType.None,
Top = BorderType.None,
Right = BorderType.None,
Bottom = BorderType.None,
};
}
}
BorderType.cs
namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public enum BorderType
{
None = 0,
Thin,
}
public static class BorderTypeFactory
{
public static BorderType Get(string? borderValue)
{
if(string.IsNullOrEmpty(borderValue))
{
return BorderType.None;
}
return borderValue.ToLower() switch
{
"thin" => BorderType.Thin,
_ => BorderType.None,
};
}
}
XlsFileReader.cs
...
private Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
{
// Borders
Worksheets.CellBorders borders = GetBorders(bookPart, cell);
...
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = valueType,
Value = value,
Borders = borders,
};
}
...
private static Worksheets.CellBorders GetBorders(WorkbookPart bookPart, Cell cell)
{
if(cell.StyleIndex?.Value == null)
{
return Worksheets.CellBorders.GetNoBorders();
}
CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)cell.StyleIndex.Value) as CellFormat;
if(cellFormat?.BorderId?.Value != null)
{
Border? border = bookPart.WorkbookStylesPart?.Stylesheet?.Borders?.ElementAt(
(int)cellFormat.BorderId.Value) as Border;
if(border != null)
{
return new ()
{
Left = Worksheets.BorderTypeFactory.Get(border?.LeftBorder?.Style?.InnerText),
Top = Worksheets.BorderTypeFactory.Get(border?.TopBorder?.Style?.InnerText),
Right = Worksheets.BorderTypeFactory.Get(border?.RightBorder?.Style?.InnerText),
Bottom = Worksheets.BorderTypeFactory.Get(border?.BottomBorder?.Style?.InnerText),
};
}
}
return Worksheets.CellBorders.GetNoBorders();
}
...
Background color
XlsFileReader.cs
...
private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
{
// Cell color
string? backgroundColor = GetCellColor(cell, bookPart);
...
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = valueType,
Value = value,
BackgroundColor = backgroundColor,
Borders = borders,
};
}
...
private static string? GetCellColor(Cell cell, WorkbookPart bookPart)
{
uint? styleIndex = cell.StyleIndex?.Value;
if(styleIndex == null)
{
return null;
}
CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)styleIndex) as CellFormat;
if (cellFormat?.FillId != null)
{
Fill? fill = bookPart.WorkbookStylesPart?.Stylesheet?.Fills?.ElementAt((int)cellFormat.FillId.Value) as Fill;
PatternFill? patternFill = fill?.PatternFill;
string? rgbColor = GetRgbColor(patternFill?.ForegroundColor?.Rgb);
if(string.IsNullOrEmpty(rgbColor) == false)
{
return rgbColor;
}
string? themeColor = GetThemeColor(bookPart, patternFill?.ForegroundColor?.Theme?.Value);
if(string.IsNullOrEmpty(themeColor) == false)
{
return themeColor;
}
}
return null;
}
private static string? GetRgbColor(HexBinaryValue? rgb)
{
if(rgb?.InnerText == null)
{
return null;
}
// Remove alpha value
return rgb.InnerText[2..];
}
private static string? GetThemeColor(WorkbookPart bookPart, uint? themeColorIndex)
{
if(themeColorIndex == null || themeColorIndex <= 0)
{
return null;
}
ThemePart? themePart = bookPart.ThemePart;
Drawing.Theme? theme = themePart?.Theme;
if(theme != null)
{
Drawing.Color2Type? color2Type = theme.ThemeElements?.ColorScheme?.ElementAt((int)themeColorIndex) as Drawing.Color2Type;
return color2Type?.RgbColorModelHex?.Val;
}
return null;
}
...
Width, Height
By default, cells don't have own widths and heights.
So I should get them from columns and rows.
PrintArea
To get column widths, I should get PrintArea first to get last column index.
PageArea.cs
namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public record PageArea(CellAddress Start, CellAddress End);
XlsFileReader.cs
...
private List<Worksheets.PageArea> GetPrintArea(WorkbookPart bookPart, Worksheet worksheet)
{
DefinedNames? definedNames = bookPart.Workbook.DefinedNames;
if(definedNames == null)
{
return [];
}
List<Worksheets.PageArea> results = [];
foreach (DefinedName definedName in definedNames.Elements<DefinedName>())
{
if(string.IsNullOrEmpty(definedName.Name?.Value))
{
continue;
}
if (definedName.Name.Value.StartsWith("_xlnm.Print_Area"))
{
string sheetName = "default sheet";
if(definedName.LocalSheetId != null)
{
Sheet? sheet = bookPart.Workbook.Sheets?.Elements<Sheet>()
?.FirstOrDefault(s => s.SheetId?.Value != null && s.SheetId.Value == definedName.LocalSheetId.Value + 1);
if(sheet?.Name != null)
{
sheetName = sheet.Name!;
}
}
string printAreaValue = definedName.Text;
// SheetName is like SheetName!$A$1:$Z$20
string[] ranges = printAreaValue.Split('!');
foreach(var r in ranges)
{
string[] addresses = r.Split(":");
if(addresses.Length < 2)
{
continue;
}
(string columnNameStart, int rowStart) = GetCellAddress(addresses[0]);
if(string.IsNullOrEmpty(columnNameStart) || rowStart <= 0)
{
continue;
}
Worksheets.CellAddress startAddress = new (columnNameStart, ConvertAlphabetToIndex(columnNameStart),
rowStart);
(string columnNameEnd, int rowEnd) = GetCellAddress(addresses[1]);
if(string.IsNullOrEmpty(columnNameEnd) || rowEnd <= 0)
{
continue;
}
Worksheets.CellAddress endAddress = new (columnNameEnd, ConvertAlphabetToIndex(columnNameEnd),
rowEnd);
results.Add(new (startAddress, endAddress));
}
}
}
return results;
}
...
Width, Height
XlsFileReader.cs
...
public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
{
private readonly double DefaultWidth = Numbers.ConvertFromPixelToCentimeter(8.38 * 7.0);
private readonly double DefaultHeight = Numbers.ConvertFromPointToCentimeter(18.75);
private static readonly Regex CellAddressRegex = new (@"\$([a-zA-Z]+)\$([0-9]+)");
private static readonly Regex ColumnNameRegex = new ("([a-zA-Z]+)");
public void Read(IFormFile file)
{
using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
WorkbookPart? bookPart = spreadsheet.WorkbookPart;
List<string> sheetNames = GetSheetNameList(bookPart);
foreach(var name in sheetNames)
{
Worksheet? targetSheet = GetWorksheet(bookPart, name);
if(targetSheet == null)
{
return;
}
List<Worksheets.PageArea> printAreas = GetPrintArea(bookPart, targetSheet);
Worksheets.PageArea? target = printAreas.FirstOrDefault();
List<Worksheets.ColumnWidth> widths = [];
if(target != null)
{
widths = GetColumnWidths(targetSheet, target.Start.Column, target.End.Column);
}
foreach(Row row in targetSheet.Descendants<Row>())
{
double height = DefaultHeight;
if(row.Height?.Value != null)
{
// The row.Height value is expressed in points
// convert it to centimeters
height = Numbers.ConvertFromPointToCentimeter(row.Height.Value);
}
foreach(Cell cell in row.Cast<Cell>())
{
string columnName = GetColumnNameFromAddress(cell.CellReference);
double? width = widths.FirstOrDefault(w => w.ColumnName == columnName)?.Width;
width ??= DefaultWidth;
Worksheets.Cell? cellValue = GetCellValue(bookPart, cell, (double)width, height);
}
}
// Load first sheet for testing.
break;
}
}
private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
{
...
return new Worksheets.Cell
{
Address = cell.CellReference?.Value ?? "A1",
Type = valueType,
Value = value,
Width = width,
Height = height,
BackgroundColor = backgroundColor,
Borders = borders,
};
}
...
private List<Worksheets.ColumnWidth> GetColumnWidths(Worksheet sheet, int startColumn, int lastColumn)
{
Columns? columns = sheet.Descendants<Columns>().FirstOrDefault();
if (columns == null)
{
return [];
}
List<Worksheets.ColumnWidth> results = [];
for (int i = startColumn; i <= lastColumn; i++)
{
double columnWidth = DefaultWidth;
if (columns != null)
{
uint idx = (uint)i;
Column? column = columns.Elements<Column>().FirstOrDefault(c =>
c?.Min != null && c.Max != null && c.Min <= idx && c.Max >= idx);
if (column?.Width != null)
{
// The column.width value represents the number of characters (7 pixels by default)
// convert it to centimeters.
columnWidth = Numbers.ConvertFromPixelToCentimeter(column.Width * 7.0);
}
}
results.Add(new Worksheets.ColumnWidth(i, ConvertIndexToAlphabet(i), columnWidth));
}
return results;
}
...
Numbers.cs
namespace OfficeFileAccessor.Apps;
public static class Numbers
{
private const double EmuToCm = 1.0 / 360000.0;
private const double PointToCm = 0.0352778;
private const double PixelToCm = 0.0264583;
public static int ParseInt(string? value, int defaultValue)
{
if(string.IsNullOrEmpty(value))
{
return defaultValue;
}
if(int.TryParse(value, out var result))
{
return result;
}
return defaultValue;
}
public static double ConvertFromEMUToCentimeter(int emuValue)
{
return EmuToCm * (double)emuValue;
}
public static double ConvertFromPointToCentimeter(double pointValue)
{
return PointToCm * pointValue;
}
public static double ConvertFromPixelToCentimeter(double pixelValue)
{
return PixelToCm * pixelValue;
}
}
Top comments (0)