Browse Source

icinga2: Add check_syncrepl_extended

Signed-off-by: Maximilian Wilhelm <max@sdn.clinic>
Maximilian Wilhelm 2 months ago
parent
commit
e63e66a2ec

+ 24 - 0
icinga2/commands.d/syncrepl_extended.conf

@@ -0,0 +1,24 @@
+#
+# Check LDAP replication
+#
+
+# CheckCommand wrapping the check_syncrepl_extended plugin. The plugin's
+# options are filled from custom variables (provider, consumer, serverid,
+# starttls, bind_dn, bind_password, base_dn).
+object CheckCommand "syncrepl_extended" {
+	import "plugin-check-command"
+
+	command = [  "/usr/local/share/monitoring-plugins/check_syncrepl_extended" ]
+
+	arguments = {
+		# LDAP URIs of the two servers to compare.
+		"--provider" = "$provider$"
+		"--consumer" = "$consumer$"
+		"--serverID" = "$serverid$"
+		# Value-less switch, controlled by the starttls variable.
+		"--starttls" = {
+			set_if = "$starttls$"
+		}
+		# Bind credentials and search base.
+		# NOTE(review): the bind password is passed as a command-line argument.
+		"--dn"       = "$bind_dn$"
+		"--pwd"      = "$bind_password$"
+		"--basedn"   = "$base_dn$"
+		# Always run the plugin in its Nagios output mode.
+		"--nagios" = {
+			set_if = true
+		}
+	}
+}

+ 673 - 0
icinga2/plugins/check_syncrepl_extended

@@ -0,0 +1,673 @@
+#!/usr/bin/env python3
+
+#
+# Script to check LDAP syncrepl replication state between two servers.
+# One server is considered the provider and the other the consumer.
+#
+# This script can check replication state with two methods:
+#  - with the first, the entryCSN of every entry of the LDAP directory is
+#    compared between the two servers
+#  - with the second, all values of all attributes of all entries are
+#    compared between the two servers.
+#
+# In all cases, the contextCSN of the servers is compared and entries not
+# present on the consumer or on the provider are reported. You can
+# disable contextCSN verification by using argument --no-check-contextCSN.
+#
+# This script is also able to "touch" an LDAP object on the provider to force
+# synchronisation of this object. This mechanism consists of adding the
+# '%%TOUCH%%' value to an attribute of this object and removing it just after.
+# The touched attribute is specified by parameter --touch. Of course, the DN
+# and password provided must have write access to this attribute.
+#
+# If you prefer, you can use the --replace-touch parameter to replace the value
+# of the touched attribute instead of adding the touch value. Useful in
+# case of a single-valued attribute.
+#
+# This script can be used as a Nagios plugin (-n argument)
+#
+# Requirement:
+# A single DN/password pair able to connect to both servers
+# and to retrieve objects from them without restriction.
+#
+# Author: Benjamin Renard <brenard@easter-eggs.com>
+# Source: https://gitea.zionetrix.net/bn8/check_syncrepl_extended
+# License: GPLv3
+#
+
+import argparse
+import logging
+import sys
+
+import getpass
+
+import ldap
+from ldap import LDAPError  # pylint: disable=no-name-in-module
+from ldap.controls import SimplePagedResultsControl
+from ldap import modlist
+
+VERSION = '0.0'
+TOUCH_VALUE = b'%%TOUCH%%'
+
+parser = argparse.ArgumentParser(
+    description=(
+        "Script to check LDAP syncrepl replication state between "
+        "two servers."),
+    epilog=(
+        'Author: Benjamin Renard <brenard@easter-eggs.com>, '
+        f'Version: {VERSION}, '
+        'Source: https://gitea.zionetrix.net/bn8/check_syncrepl_extended')
+)
+
+parser.add_argument(
+    "-p", "--provider",
+    dest="provider",
+    action="store",
+    type=str,
+    help="LDAP provider URI (example: ldaps://ldapmaster.foo:636)"
+)
+
+parser.add_argument(
+    "-c", "--consumer",
+    dest="consumer",
+    action="store",
+    type=str,
+    help="LDAP consumer URI (example: ldaps://ldapslave.foo:636)"
+)
+
+parser.add_argument(
+    "-i", "--serverID",
+    dest="serverid",
+    action="store",
+    type=int,
+    help=(
+        "Compare contextCSN of a specific master. Useful in MultiMaster "
+        "setups where each master has a unique ID and a contextCSN for "
+        "each replicated master exists. A valid serverID is a integer "
+        "value from 0 to 4095 (limited to 3 hex digits, example: '12' "
+        "compares the contextCSN matching '#00C#')"),
+    default=False
+)
+
+parser.add_argument(
+    "-T", "--starttls",
+    dest="starttls",
+    action="store_true",
+    help="Start TLS on LDAP provider/consumers connections",
+    default=False
+)
+
+parser.add_argument(
+    "-D", "--dn",
+    dest="dn",
+    action="store",
+    type=str,
+    help="LDAP bind DN (example: uid=nagios,ou=sysaccounts,o=example"
+)
+
+parser.add_argument(
+    "-P", "--pwd",
+    dest="pwd",
+    action="store",
+    type=str,
+    help="LDAP bind password",
+    default=None
+)
+
+parser.add_argument(
+    "-b", "--basedn",
+    dest="basedn",
+    action="store",
+    type=str,
+    help="LDAP base DN (example: o=example)"
+)
+
+parser.add_argument(
+    "-f", "--filter",
+    dest="filterstr",
+    action="store",
+    type=str,
+    help="LDAP filter (default: (objectClass=*))",
+    default='(objectClass=*)'
+)
+
+parser.add_argument(
+    "-d", "--debug",
+    dest="debug",
+    action="store_true",
+    help="Debug mode",
+    default=False
+)
+
+parser.add_argument(
+    "-n", "--nagios",
+    dest="nagios",
+    action="store_true",
+    help="Nagios check plugin mode",
+    default=False
+)
+
+parser.add_argument(
+    "-q", "--quiet",
+    dest="quiet",
+    action="store_true",
+    help="Quiet mode",
+    default=False
+)
+
+parser.add_argument(
+    "--no-check-certificate",
+    dest="nocheckcert",
+    action="store_true",
+    help="Don't check the server certificate (Default: False)",
+    default=False
+)
+
+parser.add_argument(
+    "--no-check-contextCSN",
+    dest="nocheckcontextcsn",
+    action="store_true",
+    help="Don't check servers contextCSN (Default: False)",
+    default=False
+)
+
+parser.add_argument(
+    "--only-check-contextCSN",
+    dest="onlycheckcontextcsn",
+    action="store_true",
+    help=(
+        "Only check servers root contextCSN (objects check disabled, "
+        "default : False)"),
+    default=False
+)
+
+parser.add_argument(
+    "-a", "--attributes",
+    dest="attrs",
+    action="store_true",
+    help="Check attributes values (Default: check only entryCSN)",
+    default=False
+)
+
+parser.add_argument(
+    "--exclude-attributes",
+    dest="excl_attrs",
+    action="store",
+    type=str,
+    help="Don't check this attribut (only in attribute check mode)",
+    default=None
+)
+
+parser.add_argument(
+    "--touch",
+    dest="touch",
+    action="store",
+    type=str,
+    help=(
+        'Touch attribute giving in parameter to force resync a this LDAP '
+        f'object from provider. A value "{TOUCH_VALUE.decode()}" will be '
+        'add to this attribute and remove after. The user use to connect '
+        'to the LDAP directory must have write permission on this '
+        'attribute on each object.'
+    ),
+    default=None
+)
+
+parser.add_argument(
+    "--replace-touch",
+    dest="replacetouch",
+    action="store_true",
+    help="In touch mode, replace value instead of adding.",
+    default=False
+)
+
+parser.add_argument(
+    "--remove-touch-value",
+    dest="removetouchvalue",
+    action="store_true",
+    help="In touch mode, remove touch value if present.",
+    default=False
+)
+
+parser.add_argument(
+    "--page-size",
+    dest="page_size",
+    action="store",
+    type=int,
+    help=(
+        "Page size: if defined, paging control using LDAP v3 extended "
+        "control will be enabled."),
+    default=None
+)
+
+options = parser.parse_args()
+
+if options.nocheckcontextcsn and options.onlycheckcontextcsn:
+    parser.error(
+        "You can't use both --no-check-contextCSN and "
+        "--only-check-contextCSN parameters and the same time")
+    if options.nagios:
+        sys.exit(3)
+    sys.exit(1)
+
+if not options.provider or not options.consumer:
+    parser.error("You must provide provider and customer URI")
+    if options.nagios:
+        sys.exit(3)
+    sys.exit(1)
+
+if not options.basedn:
+    parser.error("You must provide base DN of connection to LDAP servers")
+    if options.nagios:
+        sys.exit(3)
+    sys.exit(1)
+
+
+if not 0 <= options.serverid <= 4095:
+    parser.error(
+        "ServerID should be a integer value from 0 to 4095 "
+        "(limited to 3 hexadecimal digits).")
+    if options.nagios:
+        sys.exit(3)
+    sys.exit(1)
+
+if options.touch and not options.attrs:
+    logging.info('Force option attrs on touch mode')
+    options.attrs = True
+
+if options.dn and options.pwd is None:
+    options.pwd = getpass.getpass()
+
+excl_attrs = []
+if options.excl_attrs:
+    for ex in options.excl_attrs.split(','):
+        excl_attrs.append(ex.strip())
+
+FORMAT = "%(asctime)s - %(levelname)s: %(message)s"
+
+if options.debug:
+    logging.basicConfig(level=logging.DEBUG, format=FORMAT)
+    ldap.set_option(ldap.OPT_DEBUG_LEVEL, 0)  # pylint: disable=no-member
+elif options.nagios:
+    logging.basicConfig(level=logging.ERROR, format=FORMAT)
+elif options.quiet:
+    logging.basicConfig(level=logging.WARNING, format=FORMAT)
+else:
+    logging.basicConfig(level=logging.INFO, format=FORMAT)
+
+
+class LdapServer:
+
+    uri = ""
+    dn = ""
+    pwd = ""
+    start_tls = False
+
+    con = 0
+
+    def __init__(self, uri, dn, pwd, start_tls=False, page_size=None):
+        self.uri = uri
+        self.dn = dn
+        self.pwd = pwd
+        self.start_tls = start_tls
+        self.page_size = page_size
+
+    def connect(self):
+        if self.con == 0:
+            try:
+                con = ldap.initialize(self.uri)
+                # pylint: disable=no-member
+                con.protocol_version = ldap.VERSION3
+                if self.start_tls:
+                    con.start_tls_s()
+                if self.dn:
+                    con.simple_bind_s(self.dn, self.pwd)
+                self.con = con
+            except LDAPError:
+                logging.error("LDAP Error", exc_info=True)
+                return False
+        return True
+
+    def getContextCSN(self, basedn=False, serverid=False):
+        if not basedn:
+            basedn = self.dn
+        data = self.search(
+            basedn, '(objectclass=*)', attrs=['contextCSN'], scope='base')
+        if data:
+            contextCSNs = data[0][0][1]['contextCSN']
+            logging.debug('Found contextCSNs %s', contextCSNs)
+            if serverid is False:
+                return contextCSNs[0]
+            csnid = str(format(serverid, 'X')).zfill(3)
+            sub = str.encode(f'#{csnid}#', encoding="ascii", errors="replace")
+            CSN = [s for s in contextCSNs if sub in s]
+            if not CSN:
+                logging.error(
+                    "No contextCSN matching with ServerID %s (=%s) could be "
+                    "found.",
+                    serverid, sub
+                )
+                return False
+            return CSN[0]
+        return False
+
+    @staticmethod
+    def get_scope(scope):
+        if scope == 'base':
+            return ldap.SCOPE_BASE  # pylint: disable=no-member
+        if scope == 'one':
+            return ldap.SCOPE_ONELEVEL  # pylint: disable=no-member
+        if scope == 'sub':
+            return ldap.SCOPE_SUBTREE  # pylint: disable=no-member
+        raise Exception(f'Unknown LDAP scope "{scope}"')
+
+    def search(self, basedn, filterstr, attrs=None, scope=None):
+        if self.page_size:
+            return self.paged_search(
+                basedn, filterstr, attrs=attrs, scope=scope)
+        res_id = self.con.search(
+            basedn, self.get_scope(scope if scope else 'sub'),
+            filterstr, attrs if attrs else []
+        )
+        ret = []
+        while 1:
+            res_type, res_data = self.con.result(res_id, 0)
+            if res_data == []:
+                break
+            if res_type == ldap.RES_SEARCH_ENTRY:  # pylint: disable=no-member
+                ret.append(res_data)
+        return ret
+
+    def paged_search(self, basedn, filterstr, attrs=None, scope=None):
+        ret = []
+        page = 0
+        pg_ctrl = SimplePagedResultsControl(True, self.page_size, '')
+        while page == 0 or pg_ctrl.cookie:
+            page += 1
+            logging.debug('Page search: loading page %d', page)
+            res_id = self.con.search_ext(
+                basedn, self.get_scope(scope if scope else 'sub'),
+                filterstr, attrs if attrs else [], serverctrls=[pg_ctrl]
+            )
+            # pylint: disable=unused-variable
+            res_type, res_data, res_id, serverctrls = self.con.result3(res_id)
+            for serverctrl in serverctrls:
+                if serverctrl.controlType == SimplePagedResultsControl.controlType:
+                    pg_ctrl.cookie = serverctrl.cookie
+                    break
+            for item in res_data:
+                ret.append([item])
+        return ret
+
+    def update_object(self, dn, old, new):
+        ldif = modlist.modifyModlist(old, new)
+        if not ldif:
+            return True
+        try:
+            logging.debug('Update object %s: %s', dn, ldif)
+            self.con.modify_s(dn, ldif)
+            return True
+        except LDAPError:
+            logging.error('Error updating object %s', dn, exc_info=True)
+        return False
+
+    @staticmethod
+    def get_attr(obj, attr):
+        if attr in obj[0][1]:
+            return obj[0][1][attr]
+        return []
+
+    def touch_object(self, dn, attr, orig_value):
+        old = {}
+        if orig_value:
+            old[attr] = orig_value
+        new = {}
+
+        if options.replacetouch:
+            if not orig_value or TOUCH_VALUE not in orig_value:
+                new[attr] = [TOUCH_VALUE]
+        else:
+            new[attr] = list(orig_value)
+            if orig_value or TOUCH_VALUE in orig_value:
+                new[attr].remove(TOUCH_VALUE)
+            else:
+                new[attr].append(TOUCH_VALUE)
+        try:
+            logging.info(
+                'Touch object "%s" on attribute "%s": %s => %s',
+                dn, attr, old, new
+            )
+            if self.update_object(dn, old, new):
+                logging.info(
+                    'Restore original value of attribute "%s" of object "%s"',
+                    attr, dn)
+                if options.removetouchvalue and TOUCH_VALUE in old[attr]:
+                    old[attr].remove(TOUCH_VALUE)
+                self.update_object(dn=dn, old=new, new=old)
+                return True
+        except LDAPError:
+            logging.error('Error touching object "%s"', dn, exc_info=True)
+        return False
+
+
+if options.nocheckcert:
+    # pylint: disable=no-member
+    ldap.set_option(
+        ldap.OPT_X_TLS_REQUIRE_CERT, ldap.OPT_X_TLS_NEVER)
+
+servers = [options.provider, options.consumer]
+
+LdapServers = {}
+LdapObjects = {}
+LdapServersCSN = {}
+
+for srv in servers:
+    logging.info('Connect to %s', srv)
+    LdapServers[srv] = LdapServer(srv, options.dn, options.pwd,
+                                  options.starttls,
+                                  page_size=options.page_size)
+
+    if not LdapServers[srv].connect():
+        if options.nagios:
+            print(f'UNKWNON - Failed to connect to {srv}')
+            sys.exit(3)
+        else:
+            sys.exit(1)
+
+    if not options.nocheckcontextcsn:
+        LdapServersCSN[srv] = LdapServers[srv].getContextCSN(
+            options.basedn, options.serverid)
+        logging.info('ContextCSN of %s: %s', srv, LdapServersCSN[srv])
+
+    if not options.onlycheckcontextcsn:
+        logging.info('List objects from %s', srv)
+        LdapObjects[srv] = {}
+
+        if options.attrs:
+            for obj in LdapServers[srv].search(
+                options.basedn, options.filterstr, []
+            ):
+                logging.debug('Found on %s: %s', srv, obj[0][0])
+                LdapObjects[srv][obj[0][0]] = obj[0][1]
+        else:
+            for obj in LdapServers[srv].search(
+                options.basedn, options.filterstr, ['entryCSN']
+            ):
+                logging.debug(
+                    'Found on %s: %s / %s',
+                    srv, obj[0][0], obj[0][1]['entryCSN'][0]
+                )
+                LdapObjects[srv][obj[0][0]] = obj[0][1]['entryCSN'][0]
+
+        logging.info('%s objects founds', len(LdapObjects[srv]))
+
+
+# Compare the objects fetched from the provider with those of the other
+# server(s). Results are collected per server URI:
+#  - not_found[uri]: DNs missing on that server
+#  - not_sync[uri]:  DNs (plus differing attribute names in attribute mode)
+#                    whose content differs from the provider
+if not options.onlycheckcontextcsn:
+    not_found = {}
+    not_sync = {}
+
+    for srv in servers:
+        not_found[srv] = []
+        not_sync[srv] = []
+
+    if options.attrs:
+        logging.info(
+            "Check if objects a are synchronized (by comparing attributes's "
+            "values)")
+    else:
+        logging.info(
+            'Check if objets are synchronized (by comparing entryCSN)')
+    for obj in LdapObjects[options.provider]:
+        logging.debug('Check obj %s', obj)
+        # Here ``srv`` is the {DN: data} dict of server ``srv_name``.
+        for srv_name, srv in LdapObjects.items():
+            if srv_name == options.provider:
+                continue
+            if obj in srv:
+                # ``touch`` is only set in attribute mode below.
+                touch = False
+                if LdapObjects[options.provider][obj] != srv[obj]:
+                    if options.attrs:
+                        # Names of the attributes that differ for this object.
+                        attrs_list = []
+                        for attr in LdapObjects[options.provider][obj]:
+                            if attr in excl_attrs:
+                                continue
+                            if attr not in srv[obj]:
+                                attrs_list.append(attr)
+                                logging.debug(
+                                    "Obj %s not synchronized: %s not present on %s",
+                                    obj, ','.join(attrs_list), srv_name
+                                )
+                                touch = True
+                            else:
+                                # Sort both value lists in place so that the
+                                # order of the values does not matter.
+                                srv[obj][attr].sort()
+                                LdapObjects[options.provider][obj][attr].sort()
+                                if srv[obj][attr] != LdapObjects[options.provider][obj][attr]:
+                                    attrs_list.append(attr)
+                                    logging.debug(
+                                        "Obj %s not synchronized: %s not same value(s)",
+                                        obj, ','.join(attrs_list)
+                                    )
+                                    touch = True
+                        if attrs_list:
+                            not_sync[srv_name].append(f'{obj} ({",".join(attrs_list)})')
+                    else:
+                        logging.debug(
+                            "Obj %s not synchronized: %s <-> %s",
+                            obj, LdapObjects[options.provider][obj], srv[obj]
+                        )
+                        not_sync[srv_name].append(obj)
+                # Touch the object on the provider, passing the current
+                # values of the touched attribute (empty list if absent).
+                if touch and options.touch:
+                    orig_value = []
+                    if options.touch in LdapObjects[options.provider][obj]:
+                        orig_value = LdapObjects[options.provider][obj][options.touch]
+                    LdapServers[options.provider].touch_object(
+                        obj, options.touch, orig_value)
+            else:
+                logging.debug('Obj %s: not found on %s', obj, srv_name)
+                not_found[srv_name].append(obj)
+                # Missing objects are touched on the provider as well.
+                if options.touch:
+                    orig_value = []
+                    if options.touch in LdapObjects[options.provider][obj]:
+                        orig_value = LdapObjects[options.provider][obj][options.touch]
+                    LdapServers[options.provider].touch_object(
+                        obj, options.touch, orig_value)
+
+    # Reverse direction: objects present on the consumer but not on the
+    # provider.
+    for obj in LdapObjects[options.consumer]:
+        logging.debug('Check obj %s of consumer', obj)
+        if obj not in LdapObjects[options.provider]:
+            logging.debug('Obj %s: not found on provider', obj)
+            not_found[options.provider].append(obj)
+
+if options.nagios:
+    errors = []
+    long_output = []
+
+    if not options.nocheckcontextcsn:
+        if not LdapServersCSN[options.provider]:
+            errors.append('ContextCSN of LDAP server provider could not be found')
+        else:
+            long_output.append(
+                f'ContextCSN on LDAP server provider = {LdapServersCSN[options.provider]}')
+            for srv_name, srv_csn in LdapServersCSN.items():
+                if srv_name == options.provider:
+                    continue
+                if not srv_csn:
+                    errors.append(f'ContextCSN of {srv_name} not found')
+                elif srv_csn != LdapServersCSN[options.provider]:
+                    errors.append(
+                        f'ContextCSN of {srv_name} not the same of provider')
+                    long_output.append(
+                        f'ContextCSN on LDAP server {srv_name} = {srv_csn}')
+
+    if not options.onlycheckcontextcsn:
+        if not_found[options.consumer]:
+            errors.append(
+                f'{len(not_found[options.consumer])} not found object(s) on '
+                'consumer')
+            long_output.append(
+                f'Object(s) not found on server {options.consumer} '
+                '(consumer):')
+            for obj in not_found[options.consumer]:
+                long_output.append(f' - {obj}')
+        if not_found[options.provider]:
+            errors.append(
+                f'{len(not_found[options.provider])} not found object(s) on '
+                'provider')
+            long_output.append(
+                f'Object(s) not found on server {options.provider} '
+                '(provider):')
+            for obj in not_found[options.provider]:
+                long_output.append(f' - {obj}')
+        if not_sync[options.consumer]:
+            errors.append(
+                f'{len(not_sync[options.consumer])} not synchronized object(s) '
+                'on consumer')
+            long_output.append(
+                f'Object(s) not synchronized on server {options.consumer} '
+                '(consumer):')
+            for obj in not_sync[options.consumer]:
+                long_output.append(f' - {obj}')
+    if errors:
+        print(f'CRITICAL: {", ".join(errors)}')
+        print('\n\n')
+        print("\n".join(long_output))
+        sys.exit(2)
+    else:
+        print('OK: consumer and provider are synchronized')
+        sys.exit(0)
+else:
+    noerror = True
+    for srv in servers:
+        if not options.nocheckcontextcsn:
+            if not LdapServersCSN[options.provider]:
+                logging.warning(
+                    'ContextCSN of LDAP server provider could not be found')
+                noerror = False
+            else:
+                for srv_name, srv_csn in LdapServersCSN.items():
+                    if srv_name == options.provider:
+                        continue
+                    if not srv_csn:
+                        logging.warning('ContextCSN of %s not found', srv_name)
+                        noerror = False
+                    elif srv_csn != LdapServersCSN[options.provider]:
+                        logging.warning(
+                            'ContextCSN of %s not the same of provider',
+                            srv_name)
+                        noerror = False
+
+        if not options.onlycheckcontextcsn:
+            if not_found[srv]:
+                logging.warning(
+                    'Not found objects on %s :\n  - %s',
+                    srv, '\n  - '.join(not_found[srv])
+                )
+                noerror = False
+            if not_sync[srv]:
+                logging.warning(
+                    'Not sync objects on %s: %s',
+                    srv, '\n  - '.join(not_sync[srv])
+                )
+                noerror = False
+
+    if noerror:
+        logging.info('No sync problem detected')

+ 14 - 0
icinga2/services/ldap.conf

@@ -28,3 +28,17 @@ apply Service "ldap_replication" {
 
 	assign where host.vars.ldap_fqdn && "ldap-replica" in host.vars.roles
 }
+
+# Run the syncrepl_extended check on every LDAP replica, comparing it (as
+# consumer) with the LDAP master (as provider).
+apply Service "ldap_syncrepl_extended" {
+        import "generic-service"
+
+	check_command = "syncrepl_extended"
+
+	# The provider is fixed; the consumer is the host the service is applied to.
+	vars.provider = "ldaps://ldap-master.srv.in.ffho.net"
+	vars.consumer = "ldaps://" + host.vars.ldap_fqdn
+	vars.base_dn = "dc=ffho,dc=net"
+	vars.bind_dn = "uid=sync-check,ou=accounts,dc=ffho,dc=net"
+	# LdapSyncReplBindPassword is not defined in this file.
+	vars.bind_password = LdapSyncReplBindPassword
+
+	# Only hosts with an LDAP FQDN and the "ldap-replica" role.
+	assign where host.vars.ldap_fqdn && "ldap-replica" in host.vars.roles
+}