From Wikipedia, The longest common subsequence (LCS) problem is to find the longest subsequence common to all sequences in a set of sequences (often just two).
The following is a VBA implementation of this problem. The following functions are included;
String functions;
- longestCommonSubsequence - calculate an LCS array.
- backTraceUp and backTraceLeft - trace back either defaulting up or left respectively, and find the LCS.
- getDiff - returns the difference of the two strings. The succeeding character of =, – or + indicated if the character was equal, removed or added, respectively.
- passGetDiffOutput - passes the output of getDiff so that =, – or + are now values in a 2 x n array, with indix 0 being equal, indix 1 being removed and indix 2 being added.
Array functions;
- longestCommonSubsequenceArr - calculate an LCS array.
- backTraceUpArr - trace back defaulting up and find the LCS.
- getDiffArr - returns the difference of the two arrays as a 2 x n array, with indix 0 being equal, indix 1 being removed and indix 2 being added.
Common functions;
- max - standard maximum function.
- stringToArray - convert a string to an array for array functions.
Examples;
Download the Basic (bas) File
Unfortunately, the limitations of VBA makes a dog’s dinner out of what would be some very concise code or, perhaps that’s just my implementation…
Option Explicit
Public Function longestCommonSubsequence(ByRef string1 As String, ByRef string2 As String) As Long()
If string1 = vbNullString Or string2 = vbNullString Then
Exit Function
End If
Dim num() As Long
'define the array, note rows of zeros get added to front automatically
ReDim num(Len(string1), Len(string2))
Dim i As Long, j As Long
For i = 1 To Len(string1)
For j = 1 To Len(string2)
If Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
num(i, j) = num(i - 1, j - 1) + 1
Else
num(i, j) = max(num(i - 1, j), num(i, j - 1))
End If
Next j
Next i
longestCommonSubsequence = num
End Function
Sub exampleString()
Dim arr() As Long
Dim string1 As String
Dim string2 As String
string1 = "this is a find the haystack string"
string2 = "this is a replace the needle string"
arr = longestCommonSubsequence(string1, string2)
Dim s As String, t As String
s = backTraceUp(arr, string1, string2, Len(string1), Len(string2))
t = backTraceLeft(arr, string1, string2, Len(string1), Len(string2))
Dim a As String, b As String
a = getDiff(arr, string1, string2, Len(string1), Len(string2))
Dim brr() As Long
brr = passGetDiffOutput(a)
End Sub
Public Function max(ByRef a As Long, ByRef b As Long) As Long
If a >= b Then
max = a
Else
max = b
End If
End Function
'back traces c, defaulting in the up direction
Public Function backTraceUp(ByRef c() As Long, ByRef string1 As String, ByRef string2 As String, ByRef i As Long, ByRef j As Long) As String
If i < 1 Or j < 1 Then
backTraceUp = vbNullString
ElseIf Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
'equal characters, save it and then go up and left
backTraceUp = backTraceUp(c, string1, string2, i - 1, j - 1) & Mid$(string1, i, 1)
Else
'go in the direction of the highest number, defaulting to up
If (c(i, j - 1) > c(i - 1, j)) Then
backTraceUp = backTraceUp(c, string1, string2, i, j - 1)
Else
backTraceUp = backTraceUp(c, string1, string2, i - 1, j)
End If
End If
End Function
'back traces c, defaulting in the left direction
Public Function backTraceLeft(ByRef c() As Long, ByRef string1 As String, ByRef string2 As String, ByRef i As Long, ByRef j As Long) As String
If i < 1 Or j < 1 Then
backTraceLeft = vbNullString
ElseIf Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
'equal characters, save it and then go up and left
backTraceLeft = backTraceLeft(c, string1, string2, i - 1, j - 1) & Mid$(string1, i, 1)
Else
'go in the direction of the highest number, defaulting to left
If (c(i, j - 1) >= c(i - 1, j)) Then
backTraceLeft = backTraceLeft(c, string1, string2, i, j - 1)
Else
backTraceLeft = backTraceLeft(c, string1, string2, i - 1, j)
End If
End If
End Function
'the following function returns a string with indication to what was deleted or added
'proceding character can be;
' = no change
' - deletion
' + addition
Public Function getDiff(ByRef c() As Long, ByRef stringOld As String, ByRef stringNew As String, ByRef i As Long, ByRef j As Long) As String
If i > 0 Then
If j > 0 Then 'both are greater than zero
'can only do the following comparison when i and j are greater than zero
If Mid$(stringOld, i, 1) = Mid$(stringNew, j, 1) Then
getDiff = getDiff(c, stringOld, stringNew, i - 1, j - 1) & Mid$(stringOld, i, 1) & "="
Else
If i = 0 Then
getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
ElseIf c(i, j - 1) >= c(i - 1, j) Then
getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
ElseIf j = 0 Then
getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
ElseIf c(i, j - 1) < c(i - 1, j) Then
getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
Else
getDiff = vbNullString
End If
End If
Else 'i is is greater than zero
If j = 0 Then
getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
ElseIf c(i, j - 1) < c(i - 1, j) Then
getDiff = getDiff(c, stringOld, stringNew, i - 1, j) & Mid$(stringOld, i, 1) & "-"
Else
getDiff = vbNullString
End If
End If
Else
If j > 0 Then 'j is greater than zero
If i = 0 Then
getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
ElseIf c(i, j - 1) >= c(i - 1, j) Then
getDiff = getDiff(c, stringOld, stringNew, i, j - 1) & Mid$(stringNew, j, 1) & "+"
Else
getDiff = vbNullString
End If
Else 'none are greater than zero
getDiff = vbNullString
End If
End If
End Function
'this function returns the location of the string difference
Public Function passGetDiffOutput(ByRef outputStr As String) As Long()
Dim i As Long
i = 1
Dim typeChr As String
Dim oldi As Long
Dim newi As Long
oldi = 0
newi = 0
Dim toFrom() As Long
Dim toFromCount As Long
toFromCount = -1
Dim typeChrPrev As String
typeChrPrev = vbNullString
Do While i < Len(outputStr)
typeChr = Mid$(outputStr, i + 1, 1)
Select Case typeChr
Case "="
If typeChr <> typeChrPrev Then
'check if it is comming from a deletion
If typeChrPrev = "-" Then
toFrom(2, toFromCount) = oldi
End If
'check if it is comming from a addition
If typeChrPrev = "+" Then
toFrom(2, toFromCount) = newi
End If
End If
oldi = oldi + 1 'update old index
newi = newi + 1 'update new index
Case "-"
'check if it is comming from a addition
If typeChrPrev = "+" Then
toFrom(2, toFromCount) = newi
End If
oldi = oldi + 1 'update old index
If typeChr <> typeChrPrev Then
toFromCount = toFromCount + 1
ReDim Preserve toFrom(2, toFromCount)
'let old be -1
toFrom(0, toFromCount) = -1
toFrom(1, toFromCount) = oldi
End If
Case "+"
'check if it is comming from a deletion
If typeChrPrev = "-" Then
toFrom(2, toFromCount) = oldi
End If
newi = newi + 1 'update new index
If typeChr <> typeChrPrev Then
toFromCount = toFromCount + 1
ReDim Preserve toFrom(2, toFromCount)
'let new be 1
toFrom(0, toFromCount) = 1
toFrom(1, toFromCount) = newi
End If
End Select
i = i + 2
typeChrPrev = typeChr
Loop
'check if it ended on a deletion or adition
If typeChrPrev = "-" Then
toFrom(2, toFromCount) = oldi
End If
If typeChrPrev = "+" Then
toFrom(2, toFromCount) = newi
End If
passGetDiffOutput = toFrom
End Function
'note, arrays must be single dimension
Public Function longestCommonSubsequenceArr(ByRef array1() As String, ByRef array2() As String) As Long()
On Error Resume Next
If UBound(array1, 2) > 0 Or UBound(array2, 2) > 0 Then 'multidimensional arrays
If Error = vbNullString Then
Exit Function
End If
End If
If UBound(array1) < 0 Or UBound(array2) < 0 Then 'check if arrays are bounded
If Error <> vbNullString Then
Exit Function
End If
End If
Dim num() As Long
'define the array, note rows of zeros get added to front automatically
ReDim num(UBound(array1) + 1, UBound(array2) + 1)
Dim i As Long, j As Long
'note, arrays must always start at indice zero.
For i = 0 To UBound(array1)
For j = 0 To UBound(array2)
If array1(i) = array2(j) Then
num(i + 1, j + 1) = num(i, j) + 1
Else
num(i + 1, j + 1) = max(num(i, j + 1), num(i + 1, j))
End If
Next j
Next i
longestCommonSubsequenceArr = num
End Function
Public Function stringToArray(ByRef str As String) As String()
Dim i As Long
Dim arr() As String
ReDim arr(Len(str) - 1)
For i = 1 To Len(str)
arr(i - 1) = Mid$(str, i, 1)
Next i
stringToArray = arr
End Function
Sub exampleArr()
Dim string1 As String
Dim string2 As String
string1 = "this is a find the haystack string"
string2 = "this is a replace the needle string"
Dim a1() As String
Dim a2() As String
a1 = stringToArray(string1)
a2 = stringToArray(string2)
Dim c() As Long
c = longestCommonSubsequenceArr(a1, a2)
Dim str() As String
str = backTraceUpArr(c, a1, a2, UBound(a1), UBound(a2))
Dim dif() As String
dif = getDiffArr(c, a1, a2, UBound(a1), UBound(a2))
End Sub
'back traces c, defaulting in the up direction
Public Function backTraceUpArr(ByRef c() As Long, ByRef array1() As String, ByRef array2() As String, ByRef i As Long, ByRef j As Long) As String()
Dim arr() As String
If i < 0 Or j < 0 Then
backTraceUpArr = arr
ElseIf array1(i) = array2(j) Then
'equal characters, save it and then go up and left
arr = backTraceUpArr(c, array1, array2, i - 1, j - 1)
'check the bounding of arr
Dim bound As Long
On Error Resume Next
bound = UBound(arr)
If Error <> vbNullString Then
ReDim arr(0)
arr(0) = array1(i)
Else 'no error
ReDim Preserve arr(bound + 1)
arr(bound + 1) = array1(i)
End If
backTraceUpArr = arr
Else
'go in the direction of the highest number, defaulting to up
If (c(i + 1, j) > c(i, j + 1)) Then
backTraceUpArr = backTraceUpArr(c, array1, array2, i, j - 1)
Else
backTraceUpArr = backTraceUpArr(c, array1, array2, i - 1, j)
End If
End If
End Function
'returns a 2xn array, where
'indice 0 are equal
'indice 1 are deletions
'indice 2 are additions
Public Function getDiffArr(ByRef c() As Long, ByRef arrayOld() As String, ByRef arrayNew() As String, ByRef i As Long, ByRef j As Long) As String()
Dim arr() As String
Dim bound As Long
On Error Resume Next
If i >= 0 Then
If j >= 0 Then 'both are greater or equal to zero
'can only do the following comparison when i and j are greater or equal than zero
If arrayOld(i) = arrayNew(j) Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j - 1)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(0, 0) = arrayOld(i)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(0, bound + 1) = arrayOld(i)
End If
getDiffArr = arr
Else
If i = 0 Then
arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(2, 0) = arrayNew(j)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(2, bound + 1) = arrayNew(j)
End If
getDiffArr = arr
ElseIf c(i + 1, j - 1 + 1) >= c(i - 1 + 1, j + 1) Then
arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(2, 0) = arrayNew(j)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(2, bound + 1) = arrayNew(j)
End If
getDiffArr = arr
ElseIf j = 0 Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(1, 0) = arrayOld(i)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(1, bound + 1) = arrayOld(i)
End If
getDiffArr = arr
ElseIf c(i + 1, j - 1 + 1) < c(i - 1 + 1, j + 1) Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(1, 0) = arrayOld(i)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(1, bound + 1) = arrayOld(i)
End If
getDiffArr = arr
Else
getDiffArr = arr
End If
End If
Else 'i is is greater or equal to zero
If j = 0 Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(1, 0) = arrayOld(i)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(1, bound + 1) = arrayOld(i)
End If
getDiffArr = arr
ElseIf c(i + 1, j - 1 + 1) < c(i - 1 + 1, j + 1) Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(1, 0) = arrayOld(i)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(1, bound + 1) = arrayOld(i)
End If
getDiffArr = arr
Else
getDiffArr = arr
End If
End If
Else
If j >= 0 Then 'j is greater than zero
If i = 0 Then
arr = getDiffArr(c, arrayOld, arrayNew, i - 1, j)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(2, 0) = arrayNew(j)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(2, bound + 1) = arrayNew(j)
End If
getDiffArr = arr
ElseIf c(i + 1, j - 1 + 1) >= c(i - 1 + 1, j + 1) Then
arr = getDiffArr(c, arrayOld, arrayNew, i, j - 1)
bound = UBound(arr, 2) 'check the bounding of arr
If Error <> vbNullString Then
Err.Clear
ReDim arr(2, 0)
arr(2, 0) = arrayNew(j)
Else 'no error
ReDim Preserve arr(2, bound + 1)
arr(2, bound + 1) = arrayNew(j)
End If
getDiffArr = arr
Else
getDiffArr = arr
End If
Else 'none are greater than zero
getDiffArr = arr
End If
End If
End Function
