Files
Chamilo/vendor/ddeboer/data-import/src/Reader/CsvReader.php
2025-04-10 12:24:57 +02:00

415 lines
9.8 KiB
PHP

<?php
namespace Ddeboer\DataImport\Reader;
use Ddeboer\DataImport\Exception\DuplicateHeadersException;
/**
 * Reads a CSV file, using as little memory as possible
 *
 * The reader wraps an \SplFileObject configured for CSV parsing and exposes
 * it as a seekable, countable iterator. When a header row (or an explicit
 * list of column headers) has been configured, every row is returned as an
 * associative array keyed by header name.
 *
 * @author David de Boer <david@ddeboer.nl>
 */
class CsvReader implements CountableReader, \SeekableIterator
{
    const DUPLICATE_HEADERS_INCREMENT = 1;
    const DUPLICATE_HEADERS_MERGE = 2;

    /**
     * Number of the row that contains the column names
     *
     * @var int|null
     */
    protected $headerRowNumber;

    /**
     * CSV file
     *
     * @var \SplFileObject
     */
    protected $file;

    /**
     * Column headers as read from the CSV file
     *
     * Stored as a map of header name => number of occurrences.
     *
     * @var array
     */
    protected $columnHeaders = [];

    /**
     * Zero-based column positions for every header name
     *
     * Map of header name => list of column indexes, filled by
     * setColumnHeaders(). Used to merge duplicate headers that are not
     * adjacent in the file.
     *
     * @var array
     */
    protected $headerPositions = [];

    /**
     * Number of column headers, stored and re-used for performance
     *
     * In case of duplicate headers, this is always the number of unmerged headers.
     *
     * @var int|null
     */
    protected $headersCount;

    /**
     * Total number of rows in the CSV file
     *
     * @var int|null
     */
    protected $count;

    /**
     * Faulty CSV rows, keyed by row number
     *
     * @var array
     */
    protected $errors = [];

    /**
     * Strict parsing - skip any lines mismatching header length
     *
     * @var bool
     */
    protected $strict = true;

    /**
     * How to handle duplicate headers
     *
     * @var int|null
     */
    protected $duplicateHeadersFlag;

    /**
     * @param \SplFileObject $file      File to read from
     * @param string         $delimiter Field delimiter
     * @param string         $enclosure Field enclosure character
     * @param string         $escape    Escape character
     */
    public function __construct(\SplFileObject $file, $delimiter = ',', $enclosure = '"', $escape = '\\')
    {
        // NOTE(review): this INI directive is deprecated as of PHP 8.1. The
        // call is kept unchanged so that legacy line endings are still
        // detected on runtimes that honour it.
        ini_set('auto_detect_line_endings', true);

        $this->file = $file;
        $this->file->setFlags(
            \SplFileObject::READ_CSV |
            \SplFileObject::SKIP_EMPTY |
            \SplFileObject::READ_AHEAD |
            \SplFileObject::DROP_NEW_LINE
        );
        $this->file->setCsvControl(
            $delimiter,
            $enclosure,
            $escape
        );
    }

    /**
     * Return the current row as an array
     *
     * If a header row has been set, an associative array will be returned.
     * Rows whose column count does not match the headers are recorded in the
     * error list and skipped; the first following valid row is returned
     * instead.
     *
     * @return array|null The row, or null when no (further) valid row exists
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        // If the CSV has no column headers just return the line
        if (empty($this->columnHeaders)) {
            return $this->file->current();
        }

        $mergeDuplicates = self::DUPLICATE_HEADERS_MERGE === $this->duplicateHeadersFlag;

        // Since the CSV has column headers use them to construct an
        // associative array for the columns in this line
        do {
            $line = $this->file->current();

            // Past the end of the file there is no row left to map
            if (!is_array($line)) {
                break;
            }

            // In non-strict mode pad/slice the line to match the column headers
            if (!$this->isStrict()) {
                if ($this->headersCount > count($line)) {
                    $line = array_pad($line, $this->headersCount, null); // Line too short
                } else {
                    $line = array_slice($line, 0, $this->headersCount); // Line too long
                }
            }

            if ($mergeDuplicates) {
                // Validate the raw column count BEFORE merging: the merged
                // row always has one entry per distinct header, so checking
                // afterwards can never detect a malformed line.
                if ($this->headersCount === count($line)) {
                    return array_combine(
                        array_keys($this->columnHeaders),
                        $this->mergeDuplicates($line)
                    );
                }
            } elseif (count($this->columnHeaders) === count($line)) {
                return array_combine(array_keys($this->columnHeaders), $line);
            }

            // Counts differ, so log the row as error and skip it.
            if ($this->valid()) {
                $this->errors[$this->key()] = $line;
                $this->next();
            }
        } while ($this->valid());

        return null;
    }

    /**
     * Get column headers
     *
     * @return array Distinct header names, in order of first appearance
     */
    public function getColumnHeaders()
    {
        return array_keys($this->columnHeaders);
    }

    /**
     * Set column headers
     *
     * @param array $columnHeaders List of header names, one per column
     */
    public function setColumnHeaders(array $columnHeaders)
    {
        $this->columnHeaders = array_count_values($columnHeaders);
        $this->headersCount = count($columnHeaders);

        // Remember where each header lives so that duplicates can be merged
        // by their real column position rather than by assumed adjacency.
        $this->headerPositions = [];
        foreach (array_values($columnHeaders) as $position => $header) {
            // Same restriction as array_count_values(): only strings and
            // integers can be counted (and used as array keys).
            if (is_string($header) || is_int($header)) {
                $this->headerPositions[$header][] = $position;
            }
        }
    }

    /**
     * Set header row number
     *
     * @param int      $rowNumber  Number of the row that contains column header names
     * @param int|null $duplicates How to handle duplicates (optional). One of:
     *                             - CsvReader::DUPLICATE_HEADERS_INCREMENT;
     *                               increments duplicates (dup, dup1, dup2 etc.)
     *                             - CsvReader::DUPLICATE_HEADERS_MERGE; merges
     *                               values for duplicate headers into an array
     *                               (dup => [value1, value2, value3])
     *
     * @throws DuplicateHeadersException If duplicate headers are encountered
     *                                   and no duplicate handling has been
     *                                   specified
     */
    public function setHeaderRowNumber($rowNumber, $duplicates = null)
    {
        // The flag must be stored first: readHeaderRow() consults it.
        $this->duplicateHeadersFlag = $duplicates;
        $this->headerRowNumber = $rowNumber;

        $headers = $this->readHeaderRow($rowNumber);
        $this->setColumnHeaders($headers);
    }

    /**
     * Rewind the file pointer
     *
     * If a header row has been set, the pointer is set just below the header
     * row. That way, when you iterate over the rows, that header row is
     * skipped.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->file->rewind();

        if (null !== $this->headerRowNumber) {
            $this->file->seek($this->headerRowNumber + 1);
        }
    }

    /**
     * {@inheritdoc}
     */
    #[\ReturnTypeWillChange]
    public function count()
    {
        if (null === $this->count) {
            // Counting walks the whole iterator, so restore the position afterwards
            $position = $this->key();
            $this->count = iterator_count($this);
            $this->seek($position);
        }

        return $this->count;
    }

    /**
     * {@inheritdoc}
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this->file->next();
    }

    /**
     * {@inheritdoc}
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return $this->file->valid();
    }

    /**
     * {@inheritdoc}
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->file->key();
    }

    /**
     * {@inheritdoc}
     */
    #[\ReturnTypeWillChange]
    public function seek($pointer)
    {
        $this->file->seek($pointer);
    }

    /**
     * {@inheritdoc}
     */
    public function getFields()
    {
        return $this->getColumnHeaders();
    }

    /**
     * Get a row
     *
     * @param int $number Row number
     *
     * @return array|null
     */
    public function getRow($number)
    {
        $this->seek($number);

        return $this->current();
    }

    /**
     * Get rows that have an invalid number of columns
     *
     * @return array Faulty rows, keyed by row number
     */
    public function getErrors()
    {
        if (0 === $this->key()) {
            // Iterator has not yet been processed, so do that now
            foreach ($this as $row) { /* noop */ }
        }

        return $this->errors;
    }

    /**
     * Does the reader contain any invalid rows?
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->getErrors()) > 0;
    }

    /**
     * Should the reader use strict parsing?
     *
     * @return bool
     */
    public function isStrict()
    {
        return $this->strict;
    }

    /**
     * Set strict parsing
     *
     * @param bool $strict
     */
    public function setStrict($strict)
    {
        $this->strict = $strict;
    }

    /**
     * Read header row from CSV file
     *
     * @param int $rowNumber Row number
     *
     * @return array
     *
     * @throws DuplicateHeadersException If duplicates are found and no
     *                                   duplicate handling flag is set
     */
    protected function readHeaderRow($rowNumber)
    {
        $this->file->seek($rowNumber);
        $headers = $this->file->current();

        // Test for duplicate column headers
        $diff = array_diff_assoc($headers, array_unique($headers));
        if (count($diff) > 0) {
            switch ($this->duplicateHeadersFlag) {
                case self::DUPLICATE_HEADERS_INCREMENT:
                    $headers = $this->incrementHeaders($headers);
                    // Fall through
                case self::DUPLICATE_HEADERS_MERGE:
                    break;
                default:
                    throw new DuplicateHeadersException($diff);
            }
        }

        return $headers;
    }

    /**
     * Add an increment to duplicate headers
     *
     * So the following line:
     * |duplicate|duplicate|duplicate|
     * |first    |second   |third    |
     *
     * Yields value:
     * $duplicate => 'first', $duplicate1 => 'second', $duplicate2 => 'third'
     *
     * Headers keep their original column position, so duplicates that are
     * separated by other columns still line up with their own values.
     *
     * @param array $headers
     *
     * @return array
     */
    protected function incrementHeaders(array $headers)
    {
        $occurrences = [];
        $incrementedHeaders = [];

        foreach ($headers as $header) {
            if (!isset($occurrences[$header])) {
                // First occurrence keeps its name
                $occurrences[$header] = 0;
                $incrementedHeaders[] = $header;
            } else {
                // Subsequent occurrences get a running suffix: 1, 2, ...
                $incrementedHeaders[] = $header . ++$occurrences[$header];
            }
        }

        return $incrementedHeaders;
    }

    /**
     * Merges values for duplicate headers into an array
     *
     * So the following line:
     * |duplicate|duplicate|duplicate|
     * |first    |second   |third    |
     *
     * Yields value:
     * $duplicate => ['first', 'second', 'third']
     *
     * @param array $line Raw row, one value per (unmerged) column
     *
     * @return array One entry per distinct header, in header order
     */
    protected function mergeDuplicates(array $line)
    {
        $values = [];
        $offset = 0;

        foreach ($this->columnHeaders as $header => $count) {
            if (isset($this->headerPositions[$header])
                && count($this->headerPositions[$header]) === $count
            ) {
                $positions = $this->headerPositions[$header];
            } else {
                // No usable position data (headers were not registered via
                // setColumnHeaders()): fall back to assuming adjacent duplicates
                $positions = $count > 0 ? range($offset, $offset + $count - 1) : [];
            }
            $offset += $count;

            $cells = [];
            foreach ($positions as $position) {
                $cells[] = array_key_exists($position, $line) ? $line[$position] : null;
            }

            // A unique header maps to its plain value, duplicates to a list
            $values[] = 1 === $count ? $cells[0] : $cells;
        }

        return $values;
    }
}