GsmMessageSplitter.java

package fr.sii.ogham.sms.splitter;

import static fr.sii.ogham.sms.SmsConstants.SmppSplitConstants.MAXIMUM_BYTES_PER_MESSAGE;
import static java.util.Arrays.asList;

import java.util.ArrayList;
import java.util.List;

import fr.sii.ogham.sms.SmsConstants.SmppSplitConstants.SegmentSizes;
import fr.sii.ogham.sms.encoder.EncodedWithHeader;
import fr.sii.ogham.sms.encoder.Encoder;
import fr.sii.ogham.sms.exception.message.EncodingException;
import fr.sii.ogham.sms.exception.message.InvalidReferenceNumberException;
import fr.sii.ogham.sms.exception.message.ReferenceNumberGenerationException;
import fr.sii.ogham.sms.exception.message.SplitMessageException;

/**
 * Split the message in segments if needed.
 * 
 * <p>
 * If the size of the unencoded Java {@link String} is less than or equal to the
 * provided maximum size then no split is done. The result is a list of only one
 * segment with the encoded message as byte array.
 * 
 * <p>
 * If the size of the unencoded Java {@link String} is greater than the provided
 * maximum size then the message is split. Encoded message byte array is cut to
 * fit the provided maximum segment size. The result is a list of segments that
 * contains the required headers and the partial byte array.
 * 
 * <p>
 * If the message is split, each segment contains a header and a payload. The
 * header follows <a href="https://en.wikipedia.org/wiki/User_Data_Header">User
 * Data Header</a> specification.
 * 
 * <p>
 * The specification also allows the use of an extended character table. Even if
 * the encoding uses only one octet (GSM 7-bit encoding and GSM 8-bit encoding),
 * the characters that are present in the extended table must allocate two
 * octets (1 for the ESC character followed by the extended character). That's
 * why this splitter may need to use a {@link LengthCounter}. If such a
 * character is present then the number of characters that can fit in a segment
 * is decreased by one.
 * 
 * <p>
 * Each segment contains a reference number to identify
 * <a href="https://en.wikipedia.org/wiki/Concatenated_SMS">concatenated
 * messages</a>. The reference number can be encoded on one or two octets (see
 * {@link ReferenceNumberGenerator}). This algorithm supports both reference
 * numbers encoded on one or two octets.
 * 
 * 
 * <hr>
 * <strong><u>Explanation</u></strong>
 * 
 * 
 * <p>
 * <strong>One-octet encoding</strong><br>
 * If every character of the original string is encoded on one octet and the
 * maximum size for segments is 12 octets. Then the maximum unencoded characters
 * that can fit in a single segment is also 12.
 * 
 * <pre>
 * {@code 
 * String originalMessage = "Hello World!"
 * // Not really encoded, just to explain. 
 * // Use back-tick to indicate that it is the octet value of the character
 * byte[] encoded = [`H`, `e`, `l`, `l`, `o`, ` ` , `W`, `o`, `r`, `l`, `d`, `!`]
 * // The message can fit entirely in a single segment
 * ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
 * │`H`│`e`│`l`│`l`│`o`│` `│`W`│`o`│`r`│`l`│`d`│`!`│
 * └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
 * 
 * String originalMessage = "Hello World !!"
 * // Not really encoded, just to explain. 
 * // Use back-tick to indicate that it is the octet value of the character
 * byte[] encoded = [`H`, `e`, `l`, `l`, `o`, ` ` , `W`, `o`, `r`, `l`, `d`, ` `, `!`, `!`]
 * // The message can't fit entirely in a single segment so it must be split
 * // Header (6 octets) is added on each segment
 * // So message is split in 3 segments like this
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 1 ║`H`│`e`│`l`│`l`│`o`│` `│
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 2 ║`W`│`o`│`r`│`l`│`d`│` `│
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 3 ║`!`│`!`│   │   │   │   │
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 *   │   │   │   │   │   │ 
 *   │   │   │   │   │ This segment's number
 *   │   │   │   │   │ in the sequence
 *   │   │   │   │   │ 
 *   │   │   │   │ Total number of 
 *   │   │   │   │ segments
 *   │   │   │   │ 
 *   │   │   │ CSMS reference number
 *   │   │   │ Generated by
 *   │   │   │ ReferenceNumberGenerator
 *   │   │   │ 
 *   │   │  Length of the header,
 *   │   │  excluding the first two
 *   │   │  fields
 *   │   │
 *   │  Information Element Identifier
 *   │
 *  Length of User Data Header
 * }
 * </pre>
 * 
 * 
 * <p>
 * <strong>Two-octet encoding</strong><br>
 * If every character of the original string is encoded on two octets and the
 * maximum size for segments is 12 octets. Then the maximum unencoded characters
 * that can fit in a single segment is 6 (12 / 2).
 * 
 * <pre>
 * {@code 
 * String originalMessage = "Hello!"
 * // Each character is encoded on two octets
 * byte[] encoded = [0, 72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 33]
 * // The message can fit entirely in a single segment
 * ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
 * │ 0 │72 │ 0 │101│ 0 │108│ 0 │108│ 0 │111│ 0 │33 │
 * └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
 * 
 * String originalMessage = "Hello !!"
 * // Each character is encoded on two octets
 * byte[] encoded = [0, 72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 33, 0, 33]
 * // The message can't fit entirely in a single segment so it must be split
 * // Header (6 octets) is added on each segment
 * // So message is split in 3 segments like this
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 1 ║ 0 │72 │ 0 │101│ 0 │108│
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 2 ║ 0 │108│ 0 │111│ 0 │32 │
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 3 ║ 0 │33 │ 0 │33 │   │   │
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┘
 *   │   │   │   │   │   │ 
 *   │   │   │   │   │ This segment's number
 *   │   │   │   │   │ in the sequence
 *   │   │   │   │   │ 
 *   │   │   │   │ Total number of 
 *   │   │   │   │ segments
 *   │   │   │   │ 
 *   │   │   │ CSMS reference number
 *   │   │   │ Generated by
 *   │   │   │ ReferenceNumberGenerator
 *   │   │   │ 
 *   │   │  Length of the header,
 *   │   │  excluding the first two
 *   │   │  fields
 *   │   │
 *   │  Information Element Identifier
 *   │
 *  Length of User Data Header
 * }
 * </pre>
 * 
 * <p>
 * <strong>7-bits encoding</strong><br>
 * If every character of the original string is encoded on 7 bits and the
 * maximum size for segments is 14 octets. Then the maximum unencoded characters
 * that can fit in a single segment is 16.
 * 
 * <pre>
 * {@code 
 * String originalMessage = "aaaaaaaaaaaaaaaa"
 * The message can fit entirely in a single segment
 * encoded on 7 bits 'a' is: 1100001
 * originalMessage encoded on 7 bits is:
 *   1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001
 * originalMessage packed on 8 bits is (/!\ This is not the real packing algorithm but it is simpler to understand):
 *   11000011 10000111 00001110 00011100 00111000 01110000 11100001 11000011 10000111 00001110 00011100 00111000 01110000 11100001
 *     #c3      #87      #0e      #1c      #38      #70      #e1       #c3      #87      #0e      #1c      #38      #70      #e1  
 * byte[] encoded = [#c3, #87, #0e, #1c, #38, #70, #e1, #c3, #87, #0e, #1c, #38, #70, #e1]
 * ┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
 * │#c3│#87│#0e│#1c│#38│#70│#e1│#c3│#87│#0e│#1c│#38│#70│#e1│
 * └───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
 * 
 * String originalMessage = "aaaaaaaaaaaaaaaabbbb"
 * The message can't fit entirely in a single segment so it must be split
 * Header (6 octets) is added on each segment 
 * so each payload can contain 9 characters: (14 octets - 6 header octets) * 8 bits / 7 bits per char
 * originalMessage must be split like this (before encoding):
 * ┌─────────┬─────────┬─────────┐
 * │aaaaaaaaa│aaaaaaabb│bb       │
 * └─────────┴─────────┴─────────┘
 *   part1     part2     part3
 *   
 * encoded on 7 bits 'a' is: 1100001
 * encoded on 7 bits 'b' is: 1100010
 * part1 encoded on 7 bits is:
 *   1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100001
 * part2 encoded on 7 bits is:
 *   1100001 1100001 1100001 1100001 1100001 1100001 1100001 1100010 1100010
 * part3 encoded on 7 bits is:
 *   1100010 1100010
 * part1 packed on 8 bits is (/!\ This is not the real packing algorithm but it is simpler to understand):
 *   11000011 10000111 00001110 00011100 00111000 01110000 11100001 1100001
 *     #c3      #87      #0e      #1c      #38      #70      #e1       #c2   
 * part2 packed on 8 bits is (/!\ This is not the real packing algorithm but it is simpler to understand):
 *   11000011 10000111 00001110 00011100 00111000 01110000 11100010 1100010
 *     #c3      #87      #0e      #1c      #38      #70      #e2       #c4   
 * part3 packed on 8 bits is (/!\ This is not the real packing algorithm but it is simpler to understand):
 *   11000101 100010
 *     #c5      #88
 * 
 * So message is split in 3 segments like this
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 1 ║#c3│#87│#0e│#1c│#38│#70│#e1│#c2│
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 2 ║#c3│#87│#0e│#1c│#38│#70│#e2│#c4│
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┴───┴───┘
 * ┌───┬───┬───┬───┬───┬───╥───┬───┬───┬───┬───┬───┬───┬───┐
 * │#05│#00│#03│#??│ 3 │ 3 ║#c5│#88│   │   │   │   │   │   │
 * └───┴───┴───┴───┴───┴───╨───┴───┴───┴───┴───┴───┴───┴───┘
 *   │   │   │   │   │   │ 
 *   │   │   │   │   │ This segment's number
 *   │   │   │   │   │ in the sequence
 *   │   │   │   │   │ 
 *   │   │   │   │ Total number of 
 *   │   │   │   │ segments
 *   │   │   │   │ 
 *   │   │   │ CSMS reference number
 *   │   │   │ Generated by
 *   │   │   │ ReferenceNumberGenerator
 *   │   │   │ 
 *   │   │  Length of the header,
 *   │   │  excluding the first two
 *   │   │  fields
 *   │   │
 *   │  Information Element Identifier
 *   │
 *  Length of User Data Header
 * }
 * </pre>
 * 
 * 
 * @author Aurélien Baudet
 *
 */
public class GsmMessageSplitter implements MessageSplitter {
	/**
	 * The total number of segments and the segment number are each stored on
	 * one octet in the header, so a message can't use more segments than that.
	 */
	private static final int MAXIMUM_SEGMENTS = 255;
	
	// Header layout when the reference number is encoded on one octet
	private static final int USER_DATA_HEADER_SIZE_ONE_BYTE_REFERENCE_NUMBER = 6;
	private static final byte UDHIE_HEADER_LENGTH_ONE_BYTE_REFERENCE_NUMBER = 0x05;
	private static final byte UDHIE_IDENTIFIER_SAR_ONE_BYTE_REFERENCE_NUMBER = 0x00;
	private static final byte UDHIE_SAR_LENGTH_ONE_BYTE_REFERENCE_NUMBER = 0x03;

	// Header layout when the reference number is encoded on two octets
	private static final int USER_DATA_HEADER_SIZE_TWO_BYTES_REFERENCE_NUMBER = 7;
	private static final byte UDHIE_HEADER_LENGTH_TWO_BYTES_REFERENCE_NUMBER = 0x06;
	private static final byte UDHIE_IDENTIFIER_SAR_TWO_BYTES_REFERENCE_NUMBER = 0x08;
	private static final byte UDHIE_SAR_LENGTH_TWO_BYTES_REFERENCE_NUMBER = 0x04;

	private final Encoder encoder;
	private final SegmentSizes segmentSizes;
	private final ReferenceNumberGenerator referenceNumberGenerator;
	private final LengthCounter lengthCounter;

	/**
	 * The splitter uses the {@link Encoder} to encode each segment.
	 * 
	 * <p>
	 * The algorithm compares the length of the Java String (using
	 * {@link String#length()}) with
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} to
	 * check that the whole string can fit in a single segment. If it can't then
	 * split is applied. A reference number is generated (using
	 * {@link RandomReferenceNumberGenerator}). The number of characters that
	 * can fit in a segment with a header is derived from
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} and
	 * from the size of the header. The size of the header depends on the size
	 * of the reference number. Once the string is split in a segment, it is
	 * also encoded using {@link Encoder}.
	 * 
	 * @param encoder
	 *            the encoder to encode message
	 * @param segmentSizes
	 *            the information about size that can fit in one segment
	 *            (depends on encoder)
	 */
	public GsmMessageSplitter(Encoder encoder, SegmentSizes segmentSizes) {
		this(encoder, segmentSizes, new RandomReferenceNumberGenerator());
	}

	/**
	 * The splitter uses the {@link Encoder} to encode each segment.
	 * 
	 * <p>
	 * The algorithm compares the length of the Java String (using
	 * {@link String#length()}) with
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} to
	 * check that the whole string can fit in a single segment. If it can't then
	 * split is applied. A reference number is generated (using
	 * {@link ReferenceNumberGenerator}). The number of characters that can fit
	 * in a segment with a header is derived from
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} and
	 * from the size of the header. The size of the header depends on the size
	 * of the reference number. Once the string is split in a segment, it is
	 * also encoded using {@link Encoder}.
	 * 
	 * @param encoder
	 *            the encoder to encode message
	 * @param segmentSizes
	 *            the information about size that can fit in one segment
	 *            (depends on encoder)
	 * @param referenceNumberGenerator
	 *            generates reference numbers
	 */
	public GsmMessageSplitter(Encoder encoder, SegmentSizes segmentSizes, ReferenceNumberGenerator referenceNumberGenerator) {
		this(encoder, segmentSizes, referenceNumberGenerator, String::length);
	}

	/**
	 * The splitter uses the {@link Encoder} to encode each segment.
	 * 
	 * <p>
	 * The algorithm compares {@link LengthCounter#count(String)} with
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} to
	 * check that the whole string can fit in a single segment. If it can't then
	 * split is applied. A reference number is generated (using
	 * {@link ReferenceNumberGenerator}). The number of characters that can fit
	 * in a segment with a header is derived from
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()} and
	 * from the size of the header. The size of the header depends on the size
	 * of the reference number. Once the string is split in a segment, it is
	 * also encoded using {@link Encoder}.
	 * 
	 * @param encoder
	 *            the encoder to encode message
	 * @param segmentSizes
	 *            the information about size that can fit in one segment
	 *            (depends on encoder)
	 * @param referenceNumberGenerator
	 *            generates reference numbers
	 * @param lengthCounter
	 *            used to count the number of characters in the string (some
	 *            characters may not have the same size, using extended
	 *            character tables for example)
	 */
	public GsmMessageSplitter(Encoder encoder, SegmentSizes segmentSizes, ReferenceNumberGenerator referenceNumberGenerator, LengthCounter lengthCounter) {
		super();
		this.encoder = encoder;
		this.segmentSizes = segmentSizes;
		this.referenceNumberGenerator = referenceNumberGenerator;
		this.lengthCounter = lengthCounter;
	}

	/**
	 * Split the message into one or several encoded segments.
	 * 
	 * <p>
	 * If the counted length of the message is lower than or equal to
	 * {@link SegmentSizes#getMaximumStringLengthToFitInASingleSegment()}, a
	 * list containing a single segment without header is returned. Otherwise a
	 * reference number is generated, the message is cut into parts that each
	 * fit in a segment with a header, and one segment is generated per part.
	 * 
	 * <p>
	 * The number of segments written in the headers is the number of parts
	 * that were really cut. It can't be derived from the counted length alone
	 * because a character that counts for more than one unit may not fit at
	 * the end of a segment, leaving that segment partially filled.
	 * 
	 * @param message
	 *            the message to split
	 * @return the list of segments (only one if the whole message fits in a
	 *         single segment)
	 * @throws SplitMessageException
	 *             when the reference number can't be generated or is invalid,
	 *             when more than 255 segments are needed, when a character
	 *             can't fit in a segment or when a part can't be encoded
	 */
	@Override
	public List<Segment> split(String message) throws SplitMessageException {
		int messageLength = lengthCounter.count(message);
		if (messageLength <= segmentSizes.getMaximumStringLengthToFitInASingleSegment()) {
			return asList(singleSegment(message));
		}

		// generate new reference number
		byte[] referenceNumber = generateReferenceNumber(message);
		int maximumStringLengthPerSegment = computeMaximumStringLengthPerSegment(referenceNumber);
		if (maximumStringLengthPerSegment <= 0) {
			throw new SplitMessageException("Can't split the message because no character can fit in a segment (maximum string length per segment is " + maximumStringLengthPerSegment + ")", message);
		}

		// cut the whole message first: the total number of segments must be
		// known before generating any header
		List<String> parts = cutInParts(message, maximumStringLengthPerSegment);
		int numberOfSegments = parts.size();

		// prepare list for all of the msg segments
		List<Segment> segments = new ArrayList<>(numberOfSegments);
		for (int i = 0; i < numberOfSegments; i++) {
			segments.add(segmentWithHeader(message, parts.get(i), numberOfSegments, i + 1, referenceNumber));
		}
		return segments;
	}

	/**
	 * Cut the whole message into consecutive parts that each fit in a segment
	 * with a header. Cutting continues until every character of the message
	 * belongs to a part.
	 * 
	 * @param message
	 *            the message to cut
	 * @param maximumStringLengthPerSegment
	 *            the maximum counted length of a part
	 * @return the parts, in the order of the message
	 * @throws SplitMessageException
	 *             when more than 255 parts are needed or when a character
	 *             can't fit in a segment
	 */
	private List<String> cutInParts(String message, int maximumStringLengthPerSegment) throws SplitMessageException {
		List<String> parts = new ArrayList<>();
		int start = 0;
		while (start < message.length()) {
			if (parts.size() == MAXIMUM_SEGMENTS) {
				throw new SplitMessageException("Can't split the message because the number of segments is greater than 255", message);
			}
			String part = cutToFitInSegment(start, message, maximumStringLengthPerSegment);
			if (part.isEmpty()) {
				// not even one character fits: going on would loop forever
				throw new SplitMessageException("Can't split the message because the character at index " + start + " can't fit in a segment", message);
			}
			parts.add(part);
			start += part.length();
		}
		return parts;
	}

	/**
	 * Ask the generator for a reference number and check that it is encoded
	 * on one or two bytes.
	 * 
	 * @param message
	 *            the message that is being split (used for error reporting)
	 * @return the reference number (one or two bytes)
	 * @throws SplitMessageException
	 *             when the generation fails
	 */
	private byte[] generateReferenceNumber(String message) throws SplitMessageException {
		try {
			byte[] referenceNumber = referenceNumberGenerator.generateReferenceNumber();
			if (referenceNumber == null || referenceNumber.length == 0) {
				throw new InvalidReferenceNumberException("Generated reference number byte array can't be null or empty", referenceNumber);
			}
			if (referenceNumber.length > 2) {
				throw new InvalidReferenceNumberException(GsmMessageSplitter.class.getSimpleName() + " only support one byte or two byte reference number length", referenceNumber);
			}
			return referenceNumber;
		} catch (ReferenceNumberGenerationException e) {
			throw new SplitMessageException("Failed to split message due to reference number generation failure", message, e);
		}
	}

	/**
	 * Extract the longest part of the message that begins at {@code start} and
	 * whose counted length doesn't exceed the maximum.
	 * 
	 * @param start
	 *            the index of the first character of the part
	 * @param message
	 *            the whole message
	 * @param maximumStringLengthPerSegment
	 *            the maximum counted length of the part
	 * @return the part (empty if even the first character doesn't fit)
	 */
	private String cutToFitInSegment(int start, String message, int maximumStringLengthPerSegment) {
		// a part can't have more Java characters than its maximum counted
		// length, nor go past the end of the message
		int end = Math.min(message.length(), start + maximumStringLengthPerSegment);
		String part = message.substring(start, end);
		// some characters may count for more than one: remove characters from
		// the end until the part fits
		while (end > start && lengthCounter.count(part) > maximumStringLengthPerSegment) {
			end--;
			part = message.substring(start, end);
		}
		return part;
	}

	/**
	 * Generate the segment for a part of the message: the header followed by
	 * the encoded part.
	 * 
	 * @param wholeMessage
	 *            the message that is being split (used for error reporting)
	 * @param part
	 *            the part of the message carried by the segment
	 * @param numberOfSegments
	 *            the total number of segments
	 * @param segmentNumber
	 *            the number of this segment in the sequence (starts at 1)
	 * @param referenceNumber
	 *            the reference number shared by all segments (one or two
	 *            bytes)
	 * @return the segment
	 * @throws SplitMessageException
	 *             when the part can't be encoded
	 */
	private Segment segmentWithHeader(String wholeMessage, String part, int numberOfSegments, int segmentNumber, byte[] referenceNumber) throws SplitMessageException {
		try {
			byte[] header = buildHeader(numberOfSegments, segmentNumber, referenceNumber);
			return new EncodedSegment(new EncodedWithHeader(header, encoder.encode(part)));
		} catch (EncodingException e) {
			throw new SplitMessageException("Failed to generate segment for " + part + " (segment " + segmentNumber + "/" + numberOfSegments + ") due to encoding error", wholeMessage, e);
		}
	}

	/**
	 * Build the User Data Header of a segment. The layout depends on the size
	 * of the reference number.
	 * 
	 * @param numberOfSegments
	 *            the total number of segments
	 * @param segmentNumber
	 *            the number of this segment in the sequence (starts at 1)
	 * @param referenceNumber
	 *            the reference number shared by all segments (one or two
	 *            bytes)
	 * @return the header bytes
	 */
	private static byte[] buildHeader(int numberOfSegments, int segmentNumber, byte[] referenceNumber) {
		byte[] header = new byte[headerSize(referenceNumber)];

		if (referenceNumber.length == 1) {
			// Field 1 (1 octet): Length of User Data Header, in this case
			// 05.
			header[0] = UDHIE_HEADER_LENGTH_ONE_BYTE_REFERENCE_NUMBER;
			// Field 2 (1 octet): Information Element Identifier, equal to
			// 00 (Concatenated short messages, 8-bit reference number)
			header[1] = UDHIE_IDENTIFIER_SAR_ONE_BYTE_REFERENCE_NUMBER;
			// Field 3 (1 octet): Length of the header, excluding the first
			// two fields; equal to 03 for one byte reference number
			header[2] = UDHIE_SAR_LENGTH_ONE_BYTE_REFERENCE_NUMBER;
			// Field 4 (1 octet): 00-FF, CSMS reference number, must be same
			// for all the SMS parts in the CSMS.
			header[3] = referenceNumber[0];
			// Field 5 (1 octet): 00-FF, total number of parts. The value
			// shall remain constant for every short message which makes up
			// the concatenated short message. If the value is zero then the
			// receiving entity shall ignore the whole information element
			header[4] = (byte) numberOfSegments;
			// Field 6 (1 octet): 00-FF, this part's number in the sequence.
			// The value shall start at 1 and increment for every short
			// message which makes up the concatenated short message. If the
			// value is zero or greater than the value in Field 5 then the
			// receiving entity shall ignore the whole information element.
			// [ETSI Specification: GSM 03.40 Version 5.3.0: July 1996]
			header[5] = (byte) segmentNumber;
		} else {
			// Field 1 (1 octet): Length of User Data Header, in this case
			// 06.
			header[0] = UDHIE_HEADER_LENGTH_TWO_BYTES_REFERENCE_NUMBER;
			// Field 2 (1 octet): Information Element Identifier, equal to
			// 08 (Concatenated short messages, 16-bit reference number)
			header[1] = UDHIE_IDENTIFIER_SAR_TWO_BYTES_REFERENCE_NUMBER;
			// Field 3 (1 octet): Length of the header, excluding the first
			// two fields; equal to 04 for two bytes reference number
			header[2] = UDHIE_SAR_LENGTH_TWO_BYTES_REFERENCE_NUMBER;
			// Field 4 (2 octets): 0000-FFFF, CSMS reference number, must be
			// same for all the SMS parts in the CSMS.
			header[3] = referenceNumber[0];
			header[4] = referenceNumber[1];
			// Field 5 (1 octet): 00-FF, total number of parts. The value
			// shall remain constant for every short message which makes up
			// the concatenated short message. If the value is zero then the
			// receiving entity shall ignore the whole information element
			header[5] = (byte) numberOfSegments;
			// Field 6 (1 octet): 00-FF, this part's number in the sequence.
			// The value shall start at 1 and increment for every short
			// message which makes up the concatenated short message. If the
			// value is zero or greater than the value in Field 5 then the
			// receiving entity shall ignore the whole information element.
			// [ETSI Specification: GSM 03.40 Version 5.3.0: July 1996]
			header[6] = (byte) segmentNumber;
		}
		return header;
	}

	/**
	 * Compute the maximum counted length of a part that fits in a segment
	 * with a header: the single-segment capacity scaled by the share of the
	 * segment bytes that remains once the header is removed.
	 * 
	 * @param referenceNumber
	 *            the reference number (its size determines the header size)
	 * @return the maximum counted length per segment
	 */
	private int computeMaximumStringLengthPerSegment(byte[] referenceNumber) {
		return (int) Math.floor((MAXIMUM_BYTES_PER_MESSAGE - headerSize(referenceNumber)) * segmentSizes.getMaximumStringLengthToFitInASingleSegment() / (double) MAXIMUM_BYTES_PER_MESSAGE);
	}

	/**
	 * @param referenceNumber
	 *            the reference number (one or two bytes)
	 * @return the size of the header in bytes for that reference number size
	 */
	private static int headerSize(byte[] referenceNumber) {
		return referenceNumber.length == 1 ? USER_DATA_HEADER_SIZE_ONE_BYTE_REFERENCE_NUMBER : USER_DATA_HEADER_SIZE_TWO_BYTES_REFERENCE_NUMBER;
	}

	/**
	 * Generate the only segment of a message that doesn't need to be split:
	 * the encoded message without any header.
	 * 
	 * @param message
	 *            the message to encode
	 * @return the segment
	 * @throws SplitMessageException
	 *             when the message can't be encoded
	 */
	private Segment singleSegment(String message) throws SplitMessageException {
		try {
			return new EncodedSegment(encoder.encode(message));
		} catch (EncodingException e) {
			throw new SplitMessageException("Failed to generate single segment for " + message + " due to encoding error", message, e);
		}
	}

}