DEV Community

Masui Masanori
Masui Masanori

Posted on

[ASP.NET Core] Try reading a spreadsheet file by OpenXML 1

Intro

This time, I will try reading a spreadsheet file(MS Excel file) by DocumentFormat.OpenXml.
I will use this file to check the result.

Image description

Getting cells

To load a spreadsheet using OpenXML, I will get a WorkbookPart from the spreadsheet file first.
And then I will get a WorksheetPart from it, finally I will be able to get Worksheets from the WorksheetPart.

XlsFileReader.cs

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using DocumentFormat.OpenXml.Drawing.Spreadsheet;
using Drawing = DocumentFormat.OpenXml.Drawing;
using OfficeFileAccessor.Apps;
using System.Text.RegularExpressions;

namespace OfficeFileAccessor.OfficeFiles.Readers;

public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
{
...
    public void Read(IFormFile file)
    {
        using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
        WorkbookPart? bookPart = spreadsheet.WorkbookPart;
        if(bookPart == null) {
            return;
        }
        List<string> sheetNames = GetSheetNameList(bookPart);
        foreach(var name in sheetNames)
        {
            Worksheet? targetSheet = GetWorksheet(bookPart, name);
            if(targetSheet == null) {
                return;
            }
            WorksheetPart? sheetPart = targetSheet.WorksheetPart;
            foreach(Row row in targetSheet.Descendants<Row>())
            {
                foreach(Cell cell in row.Cast<Cell>())
                {
                    // Get cell values
                    Worksheets.Cell? cellValue = GetCellValue(bookPart, cell);                    
                }
            }
            // Load first sheet for testing.
            break;
        }
    }
...
    private static List<string> GetSheetNameList(WorkbookPart bookPart) =>
        [.. bookPart.Workbook.Descendants<Sheet>().Where(s => string.IsNullOrEmpty(s.Name) == false).Select(s => s.Name?.Value ?? "")];
    private static Worksheet? GetWorksheet(WorkbookPart bookPart, string sheetName)
    {
        foreach(Sheet s in bookPart.Workbook.Descendants<Sheet>())
        {
            if(s.Name == sheetName && string.IsNullOrEmpty(s.Id) == false)
            {
                if(bookPart.TryGetPartById(s.Id!, out var part))
                {
                    if (part is WorksheetPart result)
                    {
                        return result.Worksheet;
                    }
                }
            }
        }
        return null;
    }
...
}
Enter fullscreen mode Exit fullscreen mode

Getting cell values

After getting cell instances, I can get values from them.

Text

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
    public required string Address { get; init; }
    public required CellValueType Type { get; init; }
    public required string Value { get; init; }
    public string? Formula { get; init; }
...
}
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {    
        // Get value
        string value = cell.InnerText;
        // if the data type is SharedString, find the value from Shared String Table
        if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
        {
            SharedStringTablePart? sharedStringTablePart = bookPart.GetPartsOfType<SharedStringTablePart>()
                ?.FirstOrDefault();
            if (sharedStringTablePart != null)
            {
                OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
                    .ElementAt(int.Parse(value));
                // Concatenate all text except phonetic reading
                string result = string.Concat(
                    sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
                                    .Where(t => CheckIsPhonetic(t) == false)
                                    .Select(t => t.Text)
                );
                return new Worksheets.Cell
                {
                    Address = cell.CellReference?.Value ?? "A1",
                    Type = Worksheets.CellValueType.Text,
                    Value = result,
                };
            }
        }
        Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value, 
        };
    }
    /// <summary>
    /// Check if the parent element is "PhoneticRun"
    /// </summary>
    /// <param name="textElement"></param>
    /// <returns></returns>
    private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
    {
        return textElement.Ancestors<PhoneticRun>().Any();
    }
...
Enter fullscreen mode Exit fullscreen mode

SharedStringTablePart

When I get the cell value from the SharedStringTablePart, if I get it from InnerText, the phonetic will also be combined.

example
  • Cell value: 建築物に関する事項(集約版)(参考様式)
  • InnerText: 建築物に関する事項(集約版)(参考様式)ケンチクブツシュウヤクバンサンコウヨウシキ

So I should filter the values.

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {
...
                OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
                    .ElementAt(int.Parse(value));
                // Concatenate all text except phonetic reading
                string result = string.Concat(
                    sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
                                    .Where(t => CheckIsPhonetic(t) == false)
                                    .Select(t => t.Text)
                );
...
    }
...
    private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
    {
        return textElement.Ancestors<PhoneticRun>().Any();
    }
...
Enter fullscreen mode Exit fullscreen mode

Double

I can get numeric values as same as getting texts.
But some values may not be correct if they have decimal points.

XlsFileReader.cs

...
        Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
        if (string.IsNullOrEmpty(value) == false &&
            double.TryParse(value, out double nv))
        {
            valueType = Worksheets.CellValueType.Double;
            value = nv.ToString("G");
        }
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value, 
        };
...
Enter fullscreen mode Exit fullscreen mode

Formula

XlsFileReader.cs

...
        // Formula
        string? formula = cell.CellFormula?.Text;
        // Get calculation result
        string? calcResult = cell.CellValue?.InnerText;

        if(string.IsNullOrEmpty(formula) == false && string.IsNullOrEmpty(calcResult) == false)
        {
            if (double.TryParse(calcResult, out double n))
            {
                calcResult = n.ToString("G");
            }
            return new Worksheets.Cell
            {
                Address = cell.CellReference?.Value ?? "A1",
                Type = Worksheets.CellValueType.Formula,
                Value = calcResult,
                Formula = formula,
            };
        }
...
Enter fullscreen mode Exit fullscreen mode

Getting cell styles

Border

Get cell borders from WorkbookStylesPart.

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
    public required string Address { get; init; }
    public required CellValueType Type { get; init; }
    public required string Value { get; init; }
    public string? Formula { get; init; }
    public required double Width { get; init; }
    public required double Height { get; init; }
    public string? BackgroundColor { get; init; }
    public required CellBorders Borders { get; init; }
...
}
Enter fullscreen mode Exit fullscreen mode

CellBorders.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;

public record CellBorders
{
    public BorderType Left { get; init; }
    public BorderType Top { get; init; }
    public BorderType Right { get; init; }
    public BorderType Bottom { get; init; }
...
    public static CellBorders GetNoBorders()
    {
        return new ()
        {
            Left = BorderType.None,
            Top = BorderType.None,
            Right = BorderType.None,
            Bottom = BorderType.None,
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

BorderType.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public enum BorderType
{
    None = 0,
    Thin,
}
public static class BorderTypeFactory
{
    public static BorderType Get(string? borderValue)
    {
        if(string.IsNullOrEmpty(borderValue))
        {
            return BorderType.None;
        }
        return borderValue.ToLower() switch
        {
            "thin" => BorderType.Thin,
            _ => BorderType.None,
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
    {
        // Borders
        Worksheets.CellBorders borders = GetBorders(bookPart, cell);
...
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value,
            Borders = borders,
        };
    }
...
    private static Worksheets.CellBorders GetBorders(WorkbookPart bookPart, Cell cell)
    {
        if(cell.StyleIndex?.Value == null)
        {
            return Worksheets.CellBorders.GetNoBorders();
        }
        CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)cell.StyleIndex.Value) as CellFormat;
        if(cellFormat?.BorderId?.Value != null)
        {
            Border? border = bookPart.WorkbookStylesPart?.Stylesheet?.Borders?.ElementAt(
                    (int)cellFormat.BorderId.Value) as Border;
            if(border != null)
            {
                return new ()
                {
                    Left = Worksheets.BorderTypeFactory.Get(border?.LeftBorder?.Style?.InnerText),
                    Top = Worksheets.BorderTypeFactory.Get(border?.TopBorder?.Style?.InnerText),
                    Right = Worksheets.BorderTypeFactory.Get(border?.RightBorder?.Style?.InnerText),
                    Bottom = Worksheets.BorderTypeFactory.Get(border?.BottomBorder?.Style?.InnerText),
                };
            }            
        }
        return Worksheets.CellBorders.GetNoBorders();
    }
...
Enter fullscreen mode Exit fullscreen mode

Background color

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {
        // Cell color
        string? backgroundColor = GetCellColor(cell, bookPart);
...
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value,
            BackgroundColor = backgroundColor,
            Borders = borders,
        };
    }
...
    private static string? GetCellColor(Cell cell, WorkbookPart bookPart)
    {
        uint? styleIndex = cell.StyleIndex?.Value;
        if(styleIndex == null)
        {
            return null;
        }
        CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)styleIndex) as CellFormat;
        if (cellFormat?.FillId != null)
        {
            Fill? fill = bookPart.WorkbookStylesPart?.Stylesheet?.Fills?.ElementAt((int)cellFormat.FillId.Value) as Fill;
            PatternFill? patternFill = fill?.PatternFill;
            string? rgbColor = GetRgbColor(patternFill?.ForegroundColor?.Rgb);
            if(string.IsNullOrEmpty(rgbColor) == false)
            {
                return rgbColor;
            }
            string? themeColor = GetThemeColor(bookPart, patternFill?.ForegroundColor?.Theme?.Value);
            if(string.IsNullOrEmpty(themeColor) == false)
            {
                return themeColor;
            }
        }    
        return null;
    }
    private static string? GetRgbColor(HexBinaryValue? rgb)
    {
        if(rgb?.InnerText == null)
        {
            return null;
        }
        // Remove alpha value
        return rgb.InnerText[2..];
    }
    private static string? GetThemeColor(WorkbookPart bookPart, uint? themeColorIndex)
    {
        if(themeColorIndex == null || themeColorIndex <= 0)
        {
            return null;
        }
        ThemePart? themePart = bookPart.ThemePart;
        Drawing.Theme? theme = themePart?.Theme;
        if(theme != null)
        {
            Drawing.Color2Type? color2Type = theme.ThemeElements?.ColorScheme?.ElementAt((int)themeColorIndex) as Drawing.Color2Type;
            return color2Type?.RgbColorModelHex?.Val;
        }
        return null;
    }
...
Enter fullscreen mode Exit fullscreen mode

Width, Height

By default, cells don't have own widths and heights.
So I should get them from columns and rows.

PrintArea

To get column widths, I should get PrintArea first to get last column index.

PageArea.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public record PageArea(CellAddress Start, CellAddress End);
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private List<Worksheets.PageArea> GetPrintArea(WorkbookPart bookPart, Worksheet worksheet)
    {
        DefinedNames? definedNames = bookPart.Workbook.DefinedNames;
        if(definedNames == null)
        {
            return [];
        }
        List<Worksheets.PageArea> results = [];
        foreach (DefinedName definedName in definedNames.Elements<DefinedName>())
        {
            if(string.IsNullOrEmpty(definedName.Name?.Value))
            {
                continue;
            }
            if (definedName.Name.Value.StartsWith("_xlnm.Print_Area"))
            {
                string sheetName = "default sheet";
                if(definedName.LocalSheetId != null)
                {
                    Sheet? sheet = bookPart.Workbook.Sheets?.Elements<Sheet>()
                        ?.FirstOrDefault(s => s.SheetId?.Value != null && s.SheetId.Value == definedName.LocalSheetId.Value + 1);
                    if(sheet?.Name != null)
                    {
                        sheetName = sheet.Name!;
                    }
                }
                string printAreaValue = definedName.Text;
                // SheetName is like SheetName!$A$1:$Z$20
                string[] ranges = printAreaValue.Split('!');                
                foreach(var r in ranges)
                {
                    string[] addresses = r.Split(":");
                    if(addresses.Length < 2)
                    {
                        continue;
                    }
                    (string columnNameStart, int rowStart) = GetCellAddress(addresses[0]);
                    if(string.IsNullOrEmpty(columnNameStart) || rowStart <= 0)
                    {
                        continue;
                    }
                    Worksheets.CellAddress startAddress = new (columnNameStart, ConvertAlphabetToIndex(columnNameStart),
                        rowStart);
                    (string columnNameEnd, int rowEnd) = GetCellAddress(addresses[1]);
                    if(string.IsNullOrEmpty(columnNameEnd) || rowEnd <= 0)
                    {
                        continue;
                    }
                    Worksheets.CellAddress endAddress = new (columnNameEnd, ConvertAlphabetToIndex(columnNameEnd),
                        rowEnd);

                    results.Add(new (startAddress, endAddress)); 
                }
            }
        }
        return results;
    }
...
Enter fullscreen mode Exit fullscreen mode

Width, Height

XlsFileReader.cs

...
    public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
    {
        private readonly double DefaultWidth = Numbers.ConvertFromPixelToCentimeter(8.38 * 7.0);
        private readonly double DefaultHeight = Numbers.ConvertFromPointToCentimeter(18.75);
        private static readonly Regex CellAddressRegex = new (@"\$([a-zA-Z]+)\$([0-9]+)");
        private static readonly Regex ColumnNameRegex = new ("([a-zA-Z]+)");
        public void Read(IFormFile file)
        {
            using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
            WorkbookPart? bookPart = spreadsheet.WorkbookPart;
            List<string> sheetNames = GetSheetNameList(bookPart);
            foreach(var name in sheetNames)
            {
                Worksheet? targetSheet = GetWorksheet(bookPart, name);
                if(targetSheet == null)
                {
                    return;
                }
                List<Worksheets.PageArea> printAreas = GetPrintArea(bookPart, targetSheet);
                Worksheets.PageArea? target = printAreas.FirstOrDefault();
                List<Worksheets.ColumnWidth> widths = [];
                if(target != null)
                {
                    widths = GetColumnWidths(targetSheet, target.Start.Column, target.End.Column);
                }
                foreach(Row row in targetSheet.Descendants<Row>())
                {
                    double height = DefaultHeight;
                    if(row.Height?.Value != null)
                    {
                        // The row.Height value is expressed in points
                        // convert it to centimeters
                        height = Numbers.ConvertFromPointToCentimeter(row.Height.Value);
                    }
                    foreach(Cell cell in row.Cast<Cell>())
                    {
                        string columnName = GetColumnNameFromAddress(cell.CellReference);
                        double? width = widths.FirstOrDefault(w => w.ColumnName == columnName)?.Width;
                        width ??= DefaultWidth;
                        Worksheets.Cell? cellValue = GetCellValue(bookPart, cell, (double)width, height);
                    }
                }
                // Load first sheet for testing.
                break;
            }
        }
        private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
        {
...
            return new Worksheets.Cell
            {
                Address = cell.CellReference?.Value ?? "A1",
                Type = valueType,
                Value = value,            
                Width = width,
                Height = height,
                BackgroundColor = backgroundColor,
                Borders = borders,
            };
        }
...
        private List<Worksheets.ColumnWidth> GetColumnWidths(Worksheet sheet, int startColumn, int lastColumn)
        {
            Columns? columns = sheet.Descendants<Columns>().FirstOrDefault();
            if (columns == null)
            {
                return [];
            }
            List<Worksheets.ColumnWidth> results = [];
            for (int i = startColumn; i <= lastColumn; i++)
            {
                double columnWidth = DefaultWidth;
                if (columns != null)
                {
                    uint idx = (uint)i;
                    Column? column = columns.Elements<Column>().FirstOrDefault(c => 
                        c?.Min != null && c.Max != null && c.Min <= idx && c.Max >= idx);
                    if (column?.Width != null)
                    {
                        // The column.width value represents the number of characters (7 pixels by default)
                        // convert it to centimeters.
                        columnWidth = Numbers.ConvertFromPixelToCentimeter(column.Width * 7.0);
                    }
                }
                results.Add(new Worksheets.ColumnWidth(i, ConvertIndexToAlphabet(i), columnWidth));
            }
            return results;
        }
...
Enter fullscreen mode Exit fullscreen mode

Numbers.cs

namespace OfficeFileAccessor.Apps;
public static class Numbers
{
    private const double EmuToCm = 1.0 / 360000.0;
    private const double PointToCm = 0.0352778;
    private const double PixelToCm = 0.0264583;

    public static int ParseInt(string? value, int defaultValue)
    {
        if(string.IsNullOrEmpty(value))
        {
            return defaultValue;
        }
        if(int.TryParse(value, out var result))
        {
            return result;
        }
        return defaultValue;
    }
    public static double ConvertFromEMUToCentimeter(int emuValue)
    {
        return EmuToCm * (double)emuValue;
    }
    public static double ConvertFromPointToCentimeter(double pointValue)
    {
        return PointToCm * pointValue;
    }
    public static double ConvertFromPixelToCentimeter(double pixelValue)
    {
        return PixelToCm * pixelValue;
    }
}
Enter fullscreen mode Exit fullscreen mode

Top comments (0)